function [] = flopsMx5( Size, Dname, Fname, TitleStr, State )
% flopsMx5 GFlops count by use of matrix multiplication
%
%   flopsMx5(Size, Dname, Fname, TitleStr, State) times the chained product
%   of five N-by-N single precision matrices (four matrix multiplications)
%   on the CPU and on the GPU for every N in Size, prints the results, and
%   periodically saves the data and a CPU/GPU performance plot.
%
%   Inputs:
%     Size     - vector (row or column) of matrix dimensions N to benchmark.
%     Dname    - directory in which the .mat, .jpg and .eps files are written.
%     Fname    - base name of the output files; 'flopsMx5_' is prepended.
%     TitleStr - text appended to 'Mx5: ' in the plot title.
%     State    - 'RESUME' loads the existing <Dname>/flopsMx5_<Fname>.mat and
%                continues after the last completed data point; any other
%                value starts a fresh run.
%
%   Outputs: none. Side effects: console output, figure 1, and the files
%   <Dname>/flopsMx5_<Fname>.{mat,jpg,eps}, written after every 10th data
%   point and after the final data point.
%
%   NOTE(review): gsingle, geval, gsync and gpu_entry are not defined in this
%   file; presumably they come from the AccelerEyes Jacket GPU toolbox.

%% GFlops benchmark based on multiplication of two matrices.
% (C) Torben Larsen, Aalborg University, 24-JUL-2010
%     E-mail: tl.jacket@es.aau.dk
%     http://wiki.accelereyes.com/wiki/index.php/Torben%27s_Corner
%

% Minimum execution time for the individual benchmark point
Tmin = 1;

% Max. number of repetitions in loop timing estimation
MaxAvg = 1E9;

% Lower bound on the per-run time so the GFlops division never sees a
% zero or negative time after the loop overhead has been subtracted
Tfloor = 2.5E-10;


%% INITIALIZE NAMES
Fname = ['flopsMx5_' Fname];
BaseName = [ Dname '/' Fname ];   % common stem of all output files


%% PREALLOCATE ARRAYS ETC. - IF "RESUME" IS USED THEN LOAD DATA
% If State==RESUME, data is loaded from the existing file for the given
% benchmark and the run continues from where it stopped.
if strcmp(State,'RESUME')
    % Load into a struct so no variables appear implicitly in the workspace.
    Saved = load([ BaseName '.mat']);
    Size = Saved.Size;            % the saved size vector replaces the argument
    T_CPU = Saved.T_CPU;
    T_CPU_tot = Saved.T_CPU_tot;
    GFlops_cpu = Saved.GFlops_cpu;
    T_GPU = Saved.T_GPU;
    T_GPU_tot = Saved.T_GPU_tot;
    GFlops_gpu = Saved.GFlops_gpu;
    % Completed points are those with a non-zero CPU result
    ii = nnz(GFlops_cpu>0);
    SizeN = Size(ii+1:end);
else
    GFlops_cpu = zeros(length(Size),1);
    GFlops_gpu = zeros(length(Size),1);
    T_CPU = zeros(length(Size),1);
    T_CPU_tot = zeros(length(Size),1);
    T_GPU = zeros(length(Size),1);
    T_GPU_tot = zeros(length(Size),1);
    ii = 0;
    SizeN = Size;
end

% Free GPU memory per data point (printed only, not saved)
Mem_MB = zeros(length(Size),1);

% Index of the final data point; used to force a last save/plot
NoPoints = length(Size);


%% PERFORM ANALYSIS
% A for-loop iterates over the columns of its array, so force a row vector;
% otherwise a column-vector Size would give one iteration with N = Size.
for N=SizeN(:).'
    ii = ii + 1;
    
    % Define matrices
    Ac01 = randn(N,N,'single');
    Ac02 = randn(N,N,'single');
    Ac03 = randn(N,N,'single');
    Ac04 = randn(N,N,'single');
    Ac05 = randn(N,N,'single');
    
    % Four N-by-N matrix products, each N^2*(2*N-1) floating point operations
    FlopCount = 4*N^2*(2*N-1);
    
    % Print matrix size
    fprintf('%4.0f / %4.0f', N, max(Size));
    
    % CPU test begin --------------------------------------------------
    % Repeat with a growing run count until one timed batch takes at
    % least Tmin seconds.
    whilecount = 0;
    Telap_cpu = -1;               % -1 marks "no estimate yet"
    while Telap_cpu < Tmin
        whilecount = whilecount + 1;
        if Telap_cpu == -1
            % First pass: estimate the single-run time from two runs
            t1 = tic;
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05; %%%%%%%%% HERE %%%
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05; %%%%%%%%% HERE %%%
            Telap_cpu = toc(t1)/2;
            NoRunsCPU = ceil(1.5*Tmin/Telap_cpu);
        else
            % Previous batch was too short: scale the run count up
            NoRunsCPU = ceil(1.5*whilecount*NoRunsCPU/Telap_cpu*Tmin);
        end
        
        % Warm-up        
        for no=1:NoRunsCPU
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05;   %%%%%%% HERE %%%
        end
        
        % Benchmark
        tstart1 = tic;
        for no=1:NoRunsCPU
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05;   %%%%%%% HERE %%%
        end
        Telap_cpu = toc(tstart1);
    end
   
    % Determine time for CPU loop alone (empty loop of NoRunsCPU
    % iterations, averaged over RPT repetitions)
    RPT = min(5E3,ceil(MaxAvg/NoRunsCPU));
    tstart = tic;
    for AvgNo=1:RPT
        for no=1:NoRunsCPU
        end
    end
    T_CPU_Loop = toc(tstart)/RPT;

    % Compute CPU times
    T_CPU(ii) = max((Telap_cpu-T_CPU_Loop)/NoRunsCPU,Tfloor);
    T_CPU_tot(ii) = Telap_cpu;
    fprintf('  |  T_CPU: %6.1f,', T_CPU_tot(ii));
    GFlops_cpu(ii) = FlopCount/(T_CPU(ii)*1E9);
    fprintf('   %7.1f [GFlops]', GFlops_cpu(ii));
    % CPU test end   --------------------------------------------------
        
    % GPU test begin --------------------------------------------------
    % Copies of the CPU matrices for the GPU run.
    % NOTE(review): presumably gsingle moves data to the GPU, geval forces
    % evaluation and gsync waits for the GPU to finish - confirm against
    % the toolbox documentation.
    Ag01 = gsingle(Ac01);
    Ag02 = gsingle(Ac02);
    Ag03 = gsingle(Ac03);
    Ag04 = gsingle(Ac04);
    Ag05 = gsingle(Ac05);
    geval(Ag01, Ag02, Ag03, Ag04, Ag05);
    
    % Same adaptive scheme as for the CPU
    whilecount = 0;
    Telap_gpu = -1;               % -1 marks "no estimate yet"
    while Telap_gpu < Tmin
        whilecount = whilecount + 1;
        if Telap_gpu == -1
            % First pass: estimate the single-run time from two runs
            gsync;
            t1 = tic;
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05;   %%%%%%% HERE %%%
            geval(Rg);
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05;   %%%%%%% HERE %%%
            geval(Rg);
            gsync;
            Telap_gpu = toc(t1)/2;
            NoRunsGPU = ceil(1.5*Tmin/Telap_gpu);
        else
            % Previous batch was too short: scale the run count up
            NoRunsGPU = ceil(1.5*whilecount*NoRunsGPU/Telap_gpu*Tmin);
        end
        
        % Warm-up
        gsync;
        for no=1:NoRunsGPU
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05;   %%%%%%% HERE %%%
            geval(Rg);
        end
        
        % Benchmark
        gsync;
        tstart1 = tic;
        for no=1:NoRunsGPU
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05;   %%%%%%% HERE %%%
            geval(Rg);
        end
        gsync;
        Telap_gpu = toc(tstart1);
    end
        
    % Determine time for GPU loop alone (empty loop of NoRunsGPU
    % iterations, averaged over RPT repetitions)
    RPT = min(5E3,ceil(MaxAvg/NoRunsGPU));
    tstart = tic;
    for AvgNo=1:RPT
        for no=1:NoRunsGPU
        end
    end
    T_GPU_Loop = toc(tstart)/RPT;

    % Compute GPU times
    T_GPU(ii) = max((Telap_gpu-T_GPU_Loop)/NoRunsGPU,Tfloor);
    T_GPU_tot(ii) = Telap_gpu;
    fprintf('   |   T_GPU: %6.1f,', T_GPU_tot(ii));
    GFlops_gpu(ii) = FlopCount/(T_GPU(ii)*1E9);
    fprintf('   %7.1f [GFlops]', GFlops_gpu(ii));
    gpu_info = gpu_entry(13);
    Mem_MB(ii) = gpu_info.gpu_free/1E6;
    % NOTE(review): no variable named gpu_hook exists in this function;
    % presumably this clears a toolbox MEX function of that name - confirm.
    clear gpu_hook;
    fprintf('   |   Mem free [MB]:  %6.1f', Mem_MB(ii));
    % GPU test end   --------------------------------------------------
    
    % Print *** as a warning for simulation time violation
    % (should not be possible unless something spooky is going on)
    if T_CPU_tot(ii)>=Tmin && T_GPU_tot(ii)>=Tmin
        fprintf('\n');
    else
        fprintf('  ***\n');
    end
    
    % Save data and plot for every 10 data points, and also after the
    % final data point so a run whose length is not a multiple of 10
    % does not lose its last results.
    if mod(ii,10)==0 || ii==NoPoints
        save([ BaseName '.mat'], 'Size', ...
             'T_CPU', 'T_CPU_tot', 'GFlops_cpu', ...
             'T_GPU', 'T_GPU_tot', 'GFlops_gpu');
        
        % Operation count per data point in GFlop (x-axis of the plot)
        Complexity = (4*Size(1:ii).^2.*(2*Size(1:ii)-1))/1E9;
        
        figure(1); clf(1);
        plot(Complexity, GFlops_cpu(1:ii), 'r-', ...
             Complexity, GFlops_gpu(1:ii), 'g-', ...
             'Linewidth',1.5);
        grid;
        xlabel('Complexity   [GFlop]');
        ylabel('Performance   [GFlops]');
        legend('CPU', 'GPU', 'Location', 'SouthEast');
        title(['Mx5: ' TitleStr]);
        
        % Save figure
        print( gcf, '-djpeg99', '-r100', [ BaseName '.jpg'] );
        print( gcf, '-depsc2', '-r2400', [ BaseName '.eps'] );
    end
end

end