Matlab Files
[C1] MAIN File
%This function reads several sets of recorded sound data and reports
%which sound class each test recording belongs to
function finalproject = pj
%PJ Classify spoken test words against averaged vowel power spectra.
%   Reads three recordings each of the "a", "e" and "o" sounds, averages
%   their normalized power spectra, and fits a feed-forward network (net1)
%   that maps the mel-frequency axis to the three averaged spectra. Each
%   test word ("pay", "so") is processed the same way with its own
%   single-output network. The 2-norm of the difference between every row
%   of the net1 output and the test-network output is displayed, and the
%   sound group with the smallest norm is reported.
%
%   FINALPROJECT is assigned only when an output is requested. It is a
%   struct with fields 'pay' and 'so', each holding the [a e o] error
%   norms for that word.
%
%   Needs the wav files named below, FRQ2MEL, and the legacy
%   WAVREAD / NEWFF / TRAIN / SIM functions.
%   NOTE(review): newer MATLAB releases replace WAVREAD with AUDIOREAD and
%   NEWFF with FEEDFORWARDNET - confirm the target release before porting.

nRead = 22050;  %Number of samples read from each wav file
nFft  = 11025;  %FFT length, also used to normalize the PSD
nKeep = 2206;   %Only the first 2206 PSD samples are needed

%Averaged PSD of the three recordings of each training sound
Pf_AVG1 = pj_avgpsd({'vc_a','vc_a2','vc_a3'}, nRead, nFft, nKeep); %"a"
Pf_AVG2 = pj_avgpsd({'vc_e','vc_e2','vc_e3'}, nRead, nFft, nKeep); %"e"
Pf_AVG3 = pj_avgpsd({'vc_o','vc_o2','vc_o3'}, nRead, nFft, nKeep); %"o"

%Frequency range, converted to the Mel scale
%NOTE(review): this axis spans 0..10000 over the retained bins, which is
%not k*Fs/nFft for an 11025-point FFT - confirm the intended scaling.
f = 100000*(0:nKeep-1)/22050;
mel = frq2mel(f);

pj_plot(1, mel, Pf_AVG1, 'The PSD of the Letter "a" Sound');
pj_plot(2, mel, Pf_AVG2, 'The PSD of the Letter "e" Sound');
pj_plot(3, mel, Pf_AVG3, 'The PSD of the Letter "o" Sound');

%Combine the averaged PSDs of all three sounds into one matrix
%(row 1 = "a", row 2 = "e", row 3 = "o")
Pf_Total = [Pf_AVG1; Pf_AVG2; Pf_AVG3];

%Train by creating a feed-forward back-propagation network
net1 = newff(minmax(mel),[80,3],{'tansig' 'purelin'},'trainlm','learnp');
net1.trainParam.epochs = 30;
net1 = train(net1,mel,Pf_Total);
%Simulate to obtain outputs and plot them
y1 = sim(net1,mel);
pj_plot(4, mel, y1, 'Plot of the Outputs of the Trained Network');

%------------------------------------------------------------------
%Test word "pay": same processing as the training data
Ps_AVG = pj_avgpsd({'pl_pay','pl_pay2'}, nRead, nFft, nKeep);
pj_plot(5, mel, Ps_AVG, 'The PSD of the word "Pay"');
%Create another network and train it on the test data.
%The listing spelled the output transfer function 'losgig', which is not
%a valid name; 'logsig' is the intended spelling.
%NOTE(review): the "so" network below uses 'purelin' - confirm that the
%two test networks are meant to differ.
net2 = newff(minmax(mel),[80,1],{'tansig' 'logsig'},'trainlm','learnp');
net2.trainParam.epochs = 30;
net2 = train(net2,mel,Ps_AVG);
yPay = sim(net2,mel);
pj_plot(6, mel, yPay, ...
    'Plot of the Output of the Trained Network with the Word "Pay"');

%Norm of the error between each output of the first network and the
%output of the test network (left unsuppressed so the values are shown)
'Result of the Test with the Word "Pay"'
A1 = norm(y1(1,:) - yPay)
E1 = norm(y1(2,:) - yPay)
O1 = norm(y1(3,:) - yPay)
%The word belongs to the class with the smallest error norm. The operands
%of the comparisons were lost in the listing; this nearest-class rule
%follows the description above. Ties go to "a", then "e".
if A1 <= E1 && A1 <= O1
    'The word "pay" belongs to the "a" sound group'
elseif E1 <= O1
    'The word "pay" belongs to the "e" sound group'
else
    'The word "pay" belongs to the "o" sound group'
end

%---------------------------------------------------------------
%Test again with the test word "so"
'Another Test with the Word "So"'
Ps_AVG = pj_avgpsd({'fc_so','fc_so2'}, nRead, nFft, nKeep);
pj_plot(7, mel, Ps_AVG, 'The PSD of the word "So"');
net3 = newff(minmax(mel),[80,1],{'tansig' 'purelin'},'trainlm','learnp');
net3.trainParam.epochs = 30;
%Train the new network on the mel axis it was created for and is
%simulated with (not the "pay" network, and not the Hz axis)
net3 = train(net3,mel,Ps_AVG);
ySo = sim(net3,mel);
pj_plot(8, mel, ySo, ...
    'Plot of the Output of the Trained Network with the Word "So"');

'Result of the Test with the Word "So"'
A2 = norm(y1(1,:) - ySo)
E2 = norm(y1(2,:) - ySo)
O2 = norm(y1(3,:) - ySo)
if A2 <= E2 && A2 <= O2
    'The word "so" belongs to the "a" sound group'
elseif E2 <= O2
    'The word "so" belongs to the "e" sound group'
else
    'The word "so" belongs to the "o" sound group'
end

%Assign the declared output only on request, so that calling PJ without
%an output argument behaves as before (no extra "ans" display)
if nargout > 0
    finalproject = struct('pay', [A1 E1 O1], 'so', [A2 E2 O2]);
end

%--------------------------------------------------------------------------
function avgPsd = pj_avgpsd(names, nRead, nFft, nKeep)
%PJ_AVGPSD Mean of the truncated normalized PSDs of several wav files.
%   NAMES is a cell array of wav file names. For each file the first NREAD
%   samples are read, transformed with an NFFT-point FFT, converted to
%   power (|X|^2 / NFFT), transposed, and cut to the first NKEEP values.
avgPsd = 0;
for k = 1:numel(names)
    x = wavread(names{k}, nRead);
    X = fft(x, nFft);
    P = X .* conj(X) / nFft;
    P = transpose(P);
    avgPsd = avgPsd + P(1:nKeep);
end
avgPsd = avgPsd / numel(names);

%--------------------------------------------------------------------------
function pj_plot(figNo, x, y, ttl)
%PJ_PLOT Plot Y against the mel axis X in figure FIGNO with title TTL.
figure(figNo);
plot(x, y);
title(ttl);
xlabel('Frequency (Mel)');
ylabel('Power');
[C2] Frequency to Mel File
function mel = frq2mel(frq)
%FRQ2MEL Convert Hertz to Mel frequency scale MEL=(FRQ)
% mel = frq2mel(frq) maps each frequency in frq (given in Hz) to its
% value on the Mel scale, a scale that follows the perceived pitch
% of a tone. The output has the same size as the input.
%
% The mapping from f (Hz) to m (Mel) is:
%
% m = ln(1 + f/700) * 1000 / ln(1+1000/700)
%
% so that 1000 Hz maps to 1000 Mel, i.e. m(1000) = 1000
%
% References:
%
% [1] S. S. Stevens & J. Volkman "The relation of pitch to
% frequency", American J of Psychology, V 53, p329 1940
% [2] C. G. M. Fant, "Acoustic description & classification
% of phonetic units", Ericsson Technics, No 1 1959
% (reprinted in "Speech Sounds & Features", MIT Press 1973)
% [3] S. B. Davis & P. Mermelstein, "Comparison of parametric
% representations for monosyllabic word recognition in
% continuously spoken sentences", IEEE ASSP, V 28,
% pp 357-366 Aug 1980
% [4] J. R. Deller Jr, J. G. Proakis, J. H. L. Hansen,
% "Discrete-Time Processing of Speech Signals", p380,
% Macmillan 1993
% [5] HTK Reference Manual p73
% Copyright (C) Mike Brookes 1998
%
% Last modified Fri Apr 3 14:57:14 1998
%
% VOICEBOX home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This program is free software; you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation; either version 2 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You can obtain a copy of the GNU General Public License from
% ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0 or by writing to
% Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Scale factor: the rounded value of 1000/ln(1+1000/700) from the formula
melScale = 1127.01048;
% Corner frequency (Hz) inside the logarithm
cornerHz = 700;
mel = melScale * log(1 + frq/cornerHz);