function [] = flopsMx40( Size, Dname, Fname, TitleStr, State )
% flopsMx40 GFlops count by use of matrix multiplication
%
%   flopsMx40(Size, Dname, Fname, TitleStr, State) times the chained
%   product of 40 single-precision N-by-N matrices, first on the CPU and
%   then on the GPU, for every N in Size. One status line is printed per N.
%   Results are saved to a .mat file and plotted (JPEG + EPS) after every
%   10th data point and after the final data point.
%
%   Inputs:
%     Size     - vector (row or column) of matrix dimensions N to benchmark.
%     Dname    - directory in which the .mat/.jpg/.eps files are written.
%     Fname    - file name stem; 'flopsMx40_' is prepended to it.
%     TitleStr - text appended to 'Mx40: ' in the plot title.
%     State    - 'RESUME' loads the existing .mat file for this benchmark
%                and continues after the last point with GFlops_cpu > 0.
%                Any other value starts from scratch.
%                Note: the .mat file also stores Size, so on RESUME the
%                loaded Size replaces the Size argument.
%
%   The function has no output arguments; results are only printed, saved
%   and plotted.
%
%   NOTE(review): gsingle, geval, gsync and gpu_entry are not defined in
%   this file - presumably they come from the AccelerEyes Jacket GPU
%   toolbox referenced below; confirm it is on the path.

%% GFlops benchmark based on multiplication of two matrices.
% (C) Torben Larsen, Aalborg University, 24-JUL-2010
%     E-mail: tl.jacket@es.aau.dk
%     http://wiki.accelereyes.com/wiki/index.php/Torben%27s_Corner
%

% Minimum execution time for the individual benchmark point
Tmin = 1;

% Max. number of repetitions in loop timing estimation
MaxAvg = 1E9;


%% INITIALIZE VECTORS
Mem_MB = zeros(length(Size),1);
Fname = ['flopsMx40_' Fname];


%% PREALLOCATE ARRAYS ETC. - IF "RESUME" IS USED THEN LOAD DATA
% If State==RESUME, data is loaded from the existing file for the given
% benchmark and the run continues from the first unfinished point.
if strcmp(State,'RESUME')
    load([ Dname '/' Fname '.mat']);
    % Number of points already completed = number of non-zero CPU results
    ii = length(find(GFlops_cpu>0));
    SizeN = Size(ii+1:end);
else
    GFlops_cpu = zeros(length(Size),1);
    GFlops_gpu = zeros(length(Size),1);
    T_CPU = zeros(length(Size),1);
    T_CPU_tot = zeros(length(Size),1);
    T_GPU = zeros(length(Size),1);
    T_GPU_tot = zeros(length(Size),1);
    ii = 0;
    SizeN = Size;
end


%% PERFORM ANALYSIS
% A MATLAB for-loop iterates over the columns of its range expression, so
% force a row vector; otherwise a column-vector Size would yield a single
% iteration with N equal to the whole vector.
for N=SizeN(:).'
    ii = ii + 1;
    
    % Define matrices
    Ac01 = randn(N,N,'single');
    Ac02 = randn(N,N,'single');
    Ac03 = randn(N,N,'single');
    Ac04 = randn(N,N,'single');
    Ac05 = randn(N,N,'single');
    Ac06 = randn(N,N,'single');
    Ac07 = randn(N,N,'single');
    Ac08 = randn(N,N,'single');
    Ac09 = randn(N,N,'single');
    Ac10 = randn(N,N,'single');
    Ac11 = randn(N,N,'single');
    Ac12 = randn(N,N,'single');
    Ac13 = randn(N,N,'single');
    Ac14 = randn(N,N,'single');
    Ac15 = randn(N,N,'single');
    Ac16 = randn(N,N,'single');
    Ac17 = randn(N,N,'single');
    Ac18 = randn(N,N,'single');
    Ac19 = randn(N,N,'single');
    Ac20 = randn(N,N,'single');
    Ac21 = randn(N,N,'single');
    Ac22 = randn(N,N,'single');
    Ac23 = randn(N,N,'single');
    Ac24 = randn(N,N,'single');
    Ac25 = randn(N,N,'single');
    Ac26 = randn(N,N,'single');
    Ac27 = randn(N,N,'single');
    Ac28 = randn(N,N,'single');
    Ac29 = randn(N,N,'single');
    Ac30 = randn(N,N,'single');
    Ac31 = randn(N,N,'single');
    Ac32 = randn(N,N,'single');
    Ac33 = randn(N,N,'single');
    Ac34 = randn(N,N,'single');
    Ac35 = randn(N,N,'single');
    Ac36 = randn(N,N,'single');
    Ac37 = randn(N,N,'single');
    Ac38 = randn(N,N,'single');
    Ac39 = randn(N,N,'single');
    Ac40 = randn(N,N,'single');

    
    % Print matrix size
    fprintf('%4.0f / %4.0f', N, max(Size));
    
    % CPU test begin --------------------------------------------------
    % The product chain is written out explicitly (not looped) so that no
    % extra loop overhead ends up inside the timed region.
    whilecount = 0;
    Telap_cpu = -1;
    while Telap_cpu < Tmin
        whilecount = whilecount + 1;
        if Telap_cpu == -1
            % First pass: time two evaluations to get a rough per-run time
            % and derive the number of runs needed to exceed Tmin.
            t1 = tic;
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05 * Ac06 * Ac07 ...
                 * Ac08 * Ac09 * Ac10 * Ac11 * Ac12 * Ac13 * Ac14 ...
                 * Ac15 * Ac16 * Ac17 * Ac18 * Ac19 * Ac20 ...
                 * Ac21 * Ac22 * Ac23 * Ac24 * Ac25 * Ac26 * Ac27 ...
                 * Ac28 * Ac29 * Ac30 * Ac31 * Ac32 * Ac33 * Ac34 ...
                 * Ac35 * Ac36 * Ac37 * Ac38 * Ac39 * Ac40; % HERE %%%
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05 * Ac06 * Ac07 ...
                 * Ac08 * Ac09 * Ac10 * Ac11 * Ac12 * Ac13 * Ac14 ...
                 * Ac15 * Ac16 * Ac17 * Ac18 * Ac19 * Ac20 ...
                 * Ac21 * Ac22 * Ac23 * Ac24 * Ac25 * Ac26 * Ac27 ...
                 * Ac28 * Ac29 * Ac30 * Ac31 * Ac32 * Ac33 * Ac34 ...
                 * Ac35 * Ac36 * Ac37 * Ac38 * Ac39 * Ac40; % HERE %%%
            Telap_cpu = toc(t1)/2;
            NoRunsCPU = ceil(1.5*Tmin/Telap_cpu);
        else
            % Previous pass was shorter than Tmin: scale the run count up,
            % more aggressively for every additional retry.
            NoRunsCPU = ceil(1.5*whilecount*NoRunsCPU/Telap_cpu*Tmin);
        end
        
        % Warm-up        
        for no=1:NoRunsCPU
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05 * Ac06 * Ac07 ...
                 * Ac08 * Ac09 * Ac10 * Ac11 * Ac12 * Ac13 * Ac14 ...
                 * Ac15 * Ac16 * Ac17 * Ac18 * Ac19 * Ac20 ...
                 * Ac21 * Ac22 * Ac23 * Ac24 * Ac25 * Ac26 * Ac27 ...
                 * Ac28 * Ac29 * Ac30 * Ac31 * Ac32 * Ac33 * Ac34 ...
                 * Ac35 * Ac36 * Ac37 * Ac38 * Ac39 * Ac40; % HERE %%%
        end
        
        % Benchmark
        tstart1 = tic;
        for no=1:NoRunsCPU
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05 * Ac06 * Ac07 ...
                 * Ac08 * Ac09 * Ac10 * Ac11 * Ac12 * Ac13 * Ac14 ...
                 * Ac15 * Ac16 * Ac17 * Ac18 * Ac19 * Ac20 ...
                 * Ac21 * Ac22 * Ac23 * Ac24 * Ac25 * Ac26 * Ac27 ...
                 * Ac28 * Ac29 * Ac30 * Ac31 * Ac32 * Ac33 * Ac34 ...
                 * Ac35 * Ac36 * Ac37 * Ac38 * Ac39 * Ac40; % HERE %%%
        end
        Telap_cpu = toc(tstart1);
    end
   
    % Determine time for CPU loop alone (empty loop of the same length,
    % averaged over RPT repetitions)
    RPT = min(5E3,ceil(MaxAvg/NoRunsCPU));
    tstart = tic;
    for AvgNo=1:RPT
        for no=1:NoRunsCPU
        end
    end
    T_CPU_Loop = toc(tstart)/RPT;

    % Compute CPU times; the per-run time is floored at 2.5E-10 s so the
    % GFlops division below can never see a zero or negative time.
    T_CPU(ii) = max((Telap_cpu-T_CPU_Loop)/NoRunsCPU,2.5E-10);
    T_CPU_tot(ii) = Telap_cpu;
    fprintf('  |  T_CPU: %6.1f,', T_CPU_tot(ii));
    % 40 matrices -> 39 products, each N^2*(2*N-1) floating point operations
    GFlops_cpu(ii) = (39*N^2*(2*N-1))/(T_CPU(ii)*1E9);
    fprintf('   %7.1f [GFlops]', GFlops_cpu(ii));
    % CPU test end   --------------------------------------------------
        
    % GPU test begin --------------------------------------------------
    % Create GPU copies of all 40 CPU matrices (Ag21..Ag40 mirror
    % Ac21..Ac40 so CPU and GPU multiply the same operands).
    Ag01 = gsingle(Ac01);
    Ag02 = gsingle(Ac02);
    Ag03 = gsingle(Ac03);
    Ag04 = gsingle(Ac04);
    Ag05 = gsingle(Ac05);
    Ag06 = gsingle(Ac06);
    Ag07 = gsingle(Ac07);
    Ag08 = gsingle(Ac08);
    Ag09 = gsingle(Ac09);
    Ag10 = gsingle(Ac10);
    Ag11 = gsingle(Ac11);
    Ag12 = gsingle(Ac12);
    Ag13 = gsingle(Ac13);
    Ag14 = gsingle(Ac14);
    Ag15 = gsingle(Ac15);
    Ag16 = gsingle(Ac16);
    Ag17 = gsingle(Ac17);
    Ag18 = gsingle(Ac18);
    Ag19 = gsingle(Ac19);
    Ag20 = gsingle(Ac20);
    Ag21 = gsingle(Ac21);
    Ag22 = gsingle(Ac22);
    Ag23 = gsingle(Ac23);
    Ag24 = gsingle(Ac24);
    Ag25 = gsingle(Ac25);
    Ag26 = gsingle(Ac26);
    Ag27 = gsingle(Ac27);
    Ag28 = gsingle(Ac28);
    Ag29 = gsingle(Ac29);
    Ag30 = gsingle(Ac30);
    Ag31 = gsingle(Ac31);
    Ag32 = gsingle(Ac32);
    Ag33 = gsingle(Ac33);
    Ag34 = gsingle(Ac34);
    Ag35 = gsingle(Ac35);
    Ag36 = gsingle(Ac36);
    Ag37 = gsingle(Ac37);
    Ag38 = gsingle(Ac38);
    Ag39 = gsingle(Ac39);
    Ag40 = gsingle(Ac40);
    % NOTE(review): geval presumably forces evaluation of its arguments on
    % the GPU before timing starts - confirm against the toolbox docs.
    geval(Ag01, Ag02, Ag03, Ag04, Ag05, Ag06, Ag07, Ag08, Ag09, Ag10, ...
          Ag11, Ag12, Ag13, Ag14, Ag15, Ag16, Ag17, Ag18, Ag19, Ag20);
    geval(Ag21, Ag22, Ag23, Ag24, Ag25, Ag26, Ag27, Ag28, Ag29, Ag30, ...
          Ag31, Ag32, Ag33, Ag34, Ag35, Ag36, Ag37, Ag38, Ag39, Ag40);
    
    whilecount = 0;
    Telap_gpu = -1;
    while Telap_gpu < Tmin
        whilecount = whilecount + 1;
        if Telap_gpu == -1
            % First pass: time two evaluations to get a rough per-run time.
            % NOTE(review): gsync presumably blocks until pending GPU work
            % has finished, so tic/toc bracket the real work - confirm.
            gsync;
            t1 = tic;
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05 * Ag06 * Ag07 ...
                 * Ag08 * Ag09 * Ag10 * Ag11 * Ag12 * Ag13 * Ag14 ...
                 * Ag15 * Ag16 * Ag17 * Ag18 * Ag19 * Ag20 ...
                 * Ag21 * Ag22 * Ag23 * Ag24 * Ag25 * Ag26 * Ag27 ...
                 * Ag28 * Ag29 * Ag30 * Ag31 * Ag32 * Ag33 * Ag34 ...
                 * Ag35 * Ag36 * Ag37 * Ag38 * Ag39 * Ag40; % HERE %%%
            geval(Rg);
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05 * Ag06 * Ag07 ...
                 * Ag08 * Ag09 * Ag10 * Ag11 * Ag12 * Ag13 * Ag14 ...
                 * Ag15 * Ag16 * Ag17 * Ag18 * Ag19 * Ag20 ...
                 * Ag21 * Ag22 * Ag23 * Ag24 * Ag25 * Ag26 * Ag27 ...
                 * Ag28 * Ag29 * Ag30 * Ag31 * Ag32 * Ag33 * Ag34 ...
                 * Ag35 * Ag36 * Ag37 * Ag38 * Ag39 * Ag40; % HERE %%%
            geval(Rg);
            gsync;
            Telap_gpu = toc(t1)/2;
            NoRunsGPU = ceil(1.5*Tmin/Telap_gpu);
        else
            % Previous pass was shorter than Tmin: scale the run count up,
            % more aggressively for every additional retry.
            NoRunsGPU = ceil(1.5*whilecount*NoRunsGPU/Telap_gpu*Tmin);
        end
        
        % Warm-up
        gsync;
        for no=1:NoRunsGPU
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05 * Ag06 * Ag07 ...
                 * Ag08 * Ag09 * Ag10 * Ag11 * Ag12 * Ag13 * Ag14 ...
                 * Ag15 * Ag16 * Ag17 * Ag18 * Ag19 * Ag20 ...
                 * Ag21 * Ag22 * Ag23 * Ag24 * Ag25 * Ag26 * Ag27 ...
                 * Ag28 * Ag29 * Ag30 * Ag31 * Ag32 * Ag33 * Ag34 ...
                 * Ag35 * Ag36 * Ag37 * Ag38 * Ag39 * Ag40; % HERE %%%
            geval(Rg);
        end
        
        % Benchmark
        gsync;
        tstart1 = tic;
        for no=1:NoRunsGPU
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05 * Ag06 * Ag07 ...
                 * Ag08 * Ag09 * Ag10 * Ag11 * Ag12 * Ag13 * Ag14 ...
                 * Ag15 * Ag16 * Ag17 * Ag18 * Ag19 * Ag20 ...
                 * Ag21 * Ag22 * Ag23 * Ag24 * Ag25 * Ag26 * Ag27 ...
                 * Ag28 * Ag29 * Ag30 * Ag31 * Ag32 * Ag33 * Ag34 ...
                 * Ag35 * Ag36 * Ag37 * Ag38 * Ag39 * Ag40; % HERE %%%
            geval(Rg);
        end
        gsync;
        Telap_gpu = toc(tstart1);
    end
        
    % Determine time for GPU loop alone (empty loop of the same length,
    % averaged over RPT repetitions)
    RPT = min(5E3,ceil(MaxAvg/NoRunsGPU));
    tstart = tic;
    for AvgNo=1:RPT
        for no=1:NoRunsGPU
        end
    end
    T_GPU_Loop = toc(tstart)/RPT;

    % Compute GPU times; same 2.5E-10 s floor as for the CPU.
    T_GPU(ii) = max((Telap_gpu-T_GPU_Loop)/NoRunsGPU,2.5E-10);
    T_GPU_tot(ii) = Telap_gpu;
    fprintf('   |   T_GPU: %6.1f,', T_GPU_tot(ii));
    GFlops_gpu(ii) = (39*N^2*(2*N-1))/(T_GPU(ii)*1E9);
    fprintf('   %7.1f [GFlops]', GFlops_gpu(ii));
    % NOTE(review): gpu_entry(13) is used here as a device-info query whose
    % result has a gpu_free field (presumably bytes, given the /1E6 and the
    % MB label) - confirm against the toolbox docs.
    gpu_info = gpu_entry(13);
    Mem_MB(ii) = gpu_info.gpu_free/1E6;
    clear gpu_hook;
    fprintf('   |   Mem free [MB]:  %6.1f', Mem_MB(ii));
    % GPU test end   --------------------------------------------------
    
    % Print *** as a warning for simulation time violation
    % (should not be possible unless something spooky is going on)
    if T_CPU_tot(ii)>=Tmin && T_GPU_tot(ii)>=Tmin
        fprintf('\n');
    else
        fprintf('  ***\n');
    end
    
    % Save data and plot for every 10 data points, and also after the
    % final data point so that trailing results (or all results when fewer
    % than 10 sizes are given) are not lost - the function returns nothing.
    if mod(ii,10)==0 || ii==numel(Size)
        save([ Dname '/' Fname '.mat'], 'Size', ...
             'T_CPU', 'T_CPU_tot', 'GFlops_cpu', ...
             'T_GPU', 'T_GPU_tot', 'GFlops_gpu');
        
        figure(1); clf(1);
        plot((39*Size(1:ii).^2.*(2*Size(1:ii)-1))/1E9, GFlops_cpu(1:ii), 'r-', ...
             (39*Size(1:ii).^2.*(2*Size(1:ii)-1))/1E9, GFlops_gpu(1:ii), 'g-', ...
             'Linewidth',1.5);
        grid;
        xlabel('Complexity   [GFlop]');
        ylabel('Performance   [GFlops]');
        legend('CPU', 'GPU', 'Location', 'SouthEast');
        title(['Mx40: ' TitleStr]);
        
        % Save figure
        print( gcf, '-djpeg99', '-r100', [ Dname '/' Fname '.jpg'] );
        print( gcf, '-depsc2', '-r2400', [ Dname '/' Fname '.eps'] );
    end
end

end
