% The images matrix U must be loaded first; the file name below is an
% assumption, inferred as the companion to the labels file.
U = load('MNISTnumImages5000.txt');

labels = load('MNISTnumLabels5000.txt');
U(:,785) = labels; % append labels as column 785
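% Not in the original: a fixed RNG seed here would make the random
% train/test split below reproducible across runs.
%rng(0);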

randomTbl = U(randperm(size(U, 1)), :); %Randomize data
trainingTbl = randomTbl(1:4000,:);
testingTbl = randomTbl(4001:5000,:);

trainingInput = trainingTbl(:,1:784);
%trainingOutputBeforeFormatting = trainingTbl(:,785);

learningRate = .1;
momentum = .1;

trainingOutput = trainingTbl(:,1:784); % autoencoder target: the network learns to reconstruct its input
% One-hot encoding for the classification variant (unused in this
% autoencoder configuration):
% for i = 1:size(trainingOutputBeforeFormatting,1)
% switch trainingOutputBeforeFormatting(i,1)
% case 0
% trainingOutput(i,:) = [1 0 0 0 0 0 0 0 0 0];
% case 1
% trainingOutput(i,:) = [0 1 0 0 0 0 0 0 0 0];
% case 2
% trainingOutput(i,:) = [0 0 1 0 0 0 0 0 0 0];
% case 3
% trainingOutput(i,:) = [0 0 0 1 0 0 0 0 0 0];
% case 4
% trainingOutput(i,:) = [0 0 0 0 1 0 0 0 0 0];
% case 5
% trainingOutput(i,:) = [0 0 0 0 0 1 0 0 0 0];
% case 6
% trainingOutput(i,:) = [0 0 0 0 0 0 1 0 0 0];
% case 7
% trainingOutput(i,:) = [0 0 0 0 0 0 0 1 0 0];
% case 8
% trainingOutput(i,:) = [0 0 0 0 0 0 0 0 1 0];
% case 9
% trainingOutput(i,:) = [0 0 0 0 0 0 0 0 0 1];
% otherwise
% end
% end
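% A vectorized sketch of the same encoding (not in the original; assumes the
% labels in column 785 are the integers 0-9):
% oneHot = zeros(size(trainingTbl,1), 10);
% oneHot(sub2ind(size(oneHot), (1:size(trainingTbl,1))', trainingTbl(:,785)+1)) = 1;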

hiddenLayers = 1;
hiddenNeurons = 100;
numberOfLayers = hiddenLayers + 2; % includes the input and output layers

LayerSizes = [784, hiddenNeurons, 784]; % autoencoder: output size matches input. TODO: set code up for arbitrary layer sizes later

epochs = 0;

[P,N] = size(trainingInput);
[Pd,M] = size(trainingOutput);

if N ~= LayerSizes(1)
e = sprintf('Dimensions of input (%d) do not match input layer (%d)', N, LayerSizes(1));
error('backprop:invalidLayerSize', e);
elseif M ~= LayerSizes(end)
e = sprintf('Dimensions of output (%d) do not match output layer (%d)', M, LayerSizes(end));
error('backprop:invalidLayerSize', e);
end

w = cell(numberOfLayers-1,1); % a weight matrix between each layer

for i=1:numberOfLayers-2
% random weights in [-1,1], plus a zero row so the next layer's bias node
% lines up with the bias-augmented net/activation matrices
w{i} = [1 - 2.*rand(LayerSizes(i+1),LayerSizes(i)+1) ; zeros(1,LayerSizes(i)+1)];
end
w{end} = 1 - 2.*rand(LayerSizes(end),LayerSizes(end-1)+1); % output layer: no bias row

%ACTIVATION
a = cell(numberOfLayers,1); % one activation matrix for each layer
a{1} = [trainingInput ones(P,1)]; % a{1} is the input + '1' for the bias node activation
% a{1} remains the same throughout the computation
for i=2:numberOfLayers-1
a{i} = ones(P,LayerSizes(i)+1); % inner layers include a bias node (P-by-Nodes+1)
end
a{end} = ones(P,LayerSizes(end)); % no bias node at output layer

%NET
net = cell(numberOfLayers-1,1); % one net matrix for each layer, excluding the input
for i=1:numberOfLayers-2
net{i} = ones(P,LayerSizes(i+1)+1); % affix bias node
end
net{end} = ones(P,LayerSizes(end));

%BATCH MODE and Momentum
prev_dw = cell(numberOfLayers-1,1);
sum_dw = cell(numberOfLayers-1,1);
for i=1:numberOfLayers-1
prev_dw{i} = zeros(size(w{i})); % prev_dw starts at 0
sum_dw{i} = zeros(size(w{i}));
end

%TRAINING LOOP
while (epochs < 30000)
% FEEDFORWARD PHASE: calculate input/output of each layer for all samples
for i=1:numberOfLayers-1
net{i} = a{i} * w{i}'; % compute inputs to current layer

% compute activation (output) of the current layer; for all layers except
% the output, the last node is the bias node and its activation is 1.
% The bipolar sigmoid is an assumption consistent with the (1+a).*(1-a)
% derivative used in the deltas below.
if i < numberOfLayers-1
a{i+1} = [2./(1+exp(-net{i}(:,1:end-1)))-1 ones(P,1)];
else
a{i+1} = 2./(1+exp(-net{i}))-1;
end
end

% BACKPROPAGATION PHASE (reconstructed; the original text lost this span):
err = trainingOutput - a{end};
sse = sum(sum(err.^2)); % summed squared error over all samples and outputs
delta = (1+a{end}) .* (1-a{end}) .* err;
for i=numberOfLayers-1:-1:1
sum_dw{i} = learningRate * (delta' * a{i}); % accumulate batch weight changes
if i > 1
delta = (1+a{i}) .* (1-a{i}) .* (delta*w{i});
end
end

% update prev_dw, the weight matrices, the epoch count and the mse
for i=1:numberOfLayers-1
% we have the sum of the delta weights, divide through by the
% number of samples and add momentum * delta weight at (t-1)
% finally, update the weight matrices
prev_dw{i} = (sum_dw{i} ./ P) + (momentum * prev_dw{i});
w{i} = w{i} + prev_dw{i};
end
epochs = epochs + 1;
mse = sse/(P*M); % mse = 1/P * 1/M * summed squared error
%fprintf('%d : %d \n', epochs, mse);
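% Not in the original: an optional early stop once the training error is low
%if mse < 1e-3, break; end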

end
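% Not in the original: a minimal sketch of evaluating the trained network on
% the held-out split, reusing the assumed bipolar sigmoid from training
% (testInput, act, testMse are illustrative names).
testInput = [testingTbl(:,1:784) ones(size(testingTbl,1),1)];
act = testInput;
for i=1:numberOfLayers-1
n = act * w{i}';
if i < numberOfLayers-1
act = [2./(1+exp(-n(:,1:end-1)))-1 ones(size(n,1),1)];
else
act = 2./(1+exp(-n))-1;
end
end
testMse = sum(sum((testingTbl(:,1:784) - act).^2)) / numel(act);
fprintf('test reconstruction MSE: %f\n', testMse);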