close all
clear all
nubar_low = 100;
nubar_high = 10000;
nubar_start = 2450;
nubar_end = 3350;
p_density = 100;
eval_sub_density = 300;
nu_bar = nubar_low:1/p_density:nubar_high;
len_nubar = length(nu_bar);
nu_bar_eval = nubar_start:1/(p_density/eval_sub_density):nubar_end;
len_eval=length(nu_bar_eval);
bb = (nu_bar >= nubar_start) & (nu_bar <= nubar_end);
ind_start = find(bb,1,'first');
ind_end = find(bb,1,'last');
eval_ind = ind_start:eval_sub_density:ind_end;
fprintf('%i points per integral; %i integrals\r',len_nubar, len_eval);
n_inf = 0;
K = zeros(size(nu_bar));
k_max = 0.01;
nu_bar_0 = 2800;
gamma = 50;
K = k_max * (gamma/2)^2 * ( ((nu_bar-nu_bar_0).^2 + (gamma/2)^2).^-1 - ((nu_bar+nu_bar_0).^2 + (gamma/2)^2).^-1);
dK = zeros(size(K));
dK(2:end-1) = (K(3:end)-K(1:end-2))./(nu_bar(3:end)-nu_bar(1:end-2));
dK(1) = (K(2)-K(1))./(nu_bar(2)-nu_bar(1));
dK(end) = (K(end)-K(end-1))./(nu_bar(end)-nu_bar(end-1));
dN_KK = zeros(1,len_eval);
fprintf('\rCPU for unoptimized\r');
tic
for i = 1:len_eval
jj = eval_ind(i);
dN_KK(i) = sum(nu_bar([1:jj-1, i+1:end]) .* K([1:jj-1, i+1:end]) ./ (nu_bar([1:jj-1, i+1:end]).^2 - nu_bar(jj).^2));
dN_KK(i) = 2*dN_KK(i) + K(jj)./(2*nu_bar(jj)) + dK(jj);
end
dN_KK = 2*dN_KK + K(eval_ind)./(2*nu_bar_eval) + dK(eval_ind);
dN_KK = (1/(pi*p_density))*dN_KK;
N_KK = dN_KK + n_inf;
toc
figure
plot(nu_bar_eval,dN_KK);
hold on
fprintf('\rCPU for 1D vector optimized\r');
tic
part_a = nu_bar .* K;
part_b = nu_bar .^2;
for i = 1:len_eval
jj = eval_ind(i);
part_c = part_a ./ (part_b - part_b(jj));
part_c(jj) = 0;
dN_KK(i) = sum(part_c);
end
dN_KK = 2*dN_KK + K(eval_ind)./(2*nu_bar_eval) + dK(eval_ind);
dN_KK = (1/(pi*p_density))*dN_KK;
N_KK = dN_KK + n_inf;
toc
plot(nu_bar_eval,dN_KK);
hold on
fprintf('\rCPU 2D vectorized\r');
mem = memory;
mem_avail = mem.MemAvailableAllArrays;
mem_need = len_nubar*len_eval*8*4;
if mem_need > mem_avail
fprintf('not enough memory, need %10.3e, only %10.3e available\r',mem_need, mem_avail);
proceed = false;
else
fprintf('have enough memory, need %10.3e, %10.3e available\r',mem_need, mem_avail);
proceed = true;
end
if proceed
tic
R_temp = repmat(nu_bar.^2,len_eval,1);
R_nubar_K = repmat(nu_bar.*K,len_eval,1);
R_nu_bar_eval_sq = repmat(nu_bar_eval'.^2,1,len_nubar);
R_temp = (R_nubar_K) ./ (R_temp - R_nu_bar_eval_sq);
for i = 1:len_eval
R_temp(i,eval_ind(i)) = 0;
end
dN_KK = sum(R_temp,2)';
dN_KK = 2*dN_KK + K(eval_ind)./(2*nu_bar_eval) + dK(eval_ind);
dN_KK = (1/(pi*p_density))*dN_KK;
N_KK = dN_KK + n_inf;
toc
clear R_temp R_nubar_K R_nu_bar_eval_sq;
plot(nu_bar_eval,dN_KK);
hold on
end
fprintf('\rCPU for 1D vector optimized single precision\r');
tic
snu_bar = single(nu_bar);
sK = single(K);
sdK = single(dK);
snu_bar_eval = single(nu_bar_eval);
seval_ind = single(eval_ind);
sdN_KK = single(dN_KK);
spart_a = snu_bar .* sK;
spart_b = snu_bar .^2;
for i = 1:len_eval
jj = seval_ind(i);
spart_c = spart_a ./ (spart_b - spart_b(jj));
spart_c(jj) = 0;
sdN_KK(i) = sum(spart_c);
end
dN_KK = double(2*sdN_KK + sK(seval_ind)./(2*snu_bar_eval) + sdK(seval_ind));
dN_KK = (1/(pi*p_density))*dN_KK;
N_KK = dN_KK + n_inf;
toc
plot(nu_bar_eval,dN_KK);
hold on
fprintf('\rCPU 2D vectorized single precision\r');
mem = memory;
mem_avail = mem.MemAvailableAllArrays;
mem_need = len_nubar*len_eval*4*4;
if mem_need > mem_avail
fprintf('not enough memory, need %10.3e, only %10.3e available\r',mem_need, mem_avail);
proceed = false;
else
fprintf('have enough memory, need %10.3e, %10.3e available\r',mem_need, mem_avail);
proceed = true;
end
if proceed
tic
sR_temp = repmat(single(nu_bar).^2,len_eval,1);
sR_nubar_K = repmat(single(nu_bar).*K,len_eval,1);
sR_nu_bar_eval_sq = repmat(single(nu_bar_eval)'.^2,1,len_nubar);
seval_ind = single(eval_ind);
sR_temp = (sR_nubar_K) ./ (sR_temp - sR_nu_bar_eval_sq);
for i = 1:len_eval
sR_temp(i,seval_ind(i)) = 0;
end
dN_KK = double(sum(sR_temp,2)');
dN_KK = 2*dN_KK + K(eval_ind)./(2*nu_bar_eval) + dK(eval_ind);
dN_KK = (1/(pi*p_density))*dN_KK;
N_KK = dN_KK + n_inf;
toc
clear sR_temp sR_nubar_K sR_nu_bar_eval_sq;
plot(nu_bar_eval,dN_KK);
hold on
end
fprintf('\rCPU parfor unoptimized\r');
poolobj = gcp;
tic
parfor i = 1:len_eval
jj = eval_ind(i);
dN_KK(i) = sum(nu_bar([1:jj-1, i+1:end]) .* K([1:jj-1, i+1:end]) ./ (nu_bar([1:jj-1, i+1:end]).^2 - nu_bar(jj).^2));
dN_KK(i) = 2*dN_KK(i) + K(jj)./(2*nu_bar(jj)) + dK(jj);
end
dN_KK = 2*dN_KK + K(eval_ind)./(2*nu_bar_eval) + dK(eval_ind);
dN_KK = (1/(pi*p_density))*dN_KK;
N_KK = dN_KK + n_inf;
toc
plot(nu_bar_eval,dN_KK);
hold on
fprintf('\rCPU parfor 1D vector optimized\r');
poolobj = gcp;
tic
part_a = nu_bar .* K;
part_b = nu_bar .^2;
parfor i = 1:len_eval
jj = eval_ind(i);
part_c = part_a ./ (part_b - part_b(jj));
part_c(jj) = 0;
dN_KK(i) = sum(part_c);
end
dN_KK = 2*dN_KK + K(eval_ind)./(2*nu_bar_eval) + dK(eval_ind);
dN_KK = (1/(pi*p_density))*dN_KK;
N_KK = dN_KK + n_inf;
toc
plot(nu_bar_eval,dN_KK);
hold on
try
canUseGPU = parallel.gpu.GPUDevice.isAvailable;
catch ME
canUseGPU = false;
end
if canUseGPU
fprintf('\rGPU for un-optimized\r');
gGPU = gpuDevice(1);
reset(gGPU);
tic
gnu_bar = gpuArray(nu_bar);
gK = gpuArray(K);
gdK = gpuArray(dK);
gdN_KK = gpuArray(dN_KK);
for i = 1:len_eval
jj = eval_ind(i);
gdN_KK(i) = sum(gnu_bar([1:jj-1, jj+1:end]) .* gK([1:jj-1, jj+1:end]) ./ (gnu_bar([1:jj-1, jj+1:end]).^2 - gnu_bar(jj).^2));
gdN_KK(i) = 2*gdN_KK(i) + gK(jj)./(2*gnu_bar(jj)) + gdK(jj);
end
gdN_KK = (1/(pi*p_density))*gdN_KK;
dN_KK =gather(gdN_KK);
wait(gGPU);
toc
plot(nu_bar_eval,dN_KK);
hold on
fprintf('\rGPU for 1D vector optimized\r');
gGPU = gpuDevice(1);
reset(gGPU);
tic
gnu_bar = gpuArray(nu_bar);
gK = gpuArray(K);
gdK = gpuArray(dK);
gnu_bar_eval = gpuArray(nu_bar_eval);
gdN_KK = gpuArray(dN_KK);
geval_ind = gpuArray(eval_ind);
gpart_a = gnu_bar .* gK;
gpart_b = gnu_bar .^2;
for i = 1:len_eval
jj = geval_ind(i);
gpart_c = gpart_a ./ (gpart_b - gpart_b(jj));
gpart_c(jj) = 0;
gdN_KK(i)= sum(gpart_c);
end
gdN_KK = 2*gdN_KK + gK(geval_ind)./(2*gnu_bar_eval) + gdK(geval_ind);
gdN_KK = (1/(pi*p_density))*gdN_KK;
dN_KK =gather(gdN_KK);
wait(gGPU);
N_KK = dN_KK + n_inf;
toc
plot(nu_bar_eval,dN_KK);
hold on
fprintf('\rGPU for 2D vector optimized\r');
gGPU = gpuDevice(1);
reset(gGPU);
mem_GPU_avail = gGPU.AvailableMemory;
mem_need = len_nubar*len_eval*8*5;
if mem_need > mem_GPU_avail
fprintf('not enough memory, need %10.3e, only %10.3e available\r',mem_need, mem_GPU_avail);
proceed = false;
else
fprintf('have enough memory, need %10.3e, %10.3e available\r',mem_need, mem_GPU_avail);
proceed = true;
end
if proceed
tic
gnu_bar = gpuArray(nu_bar);
gnu_bar_eval = gpuArray(nu_bar_eval);
gK = gpuArray(K);
gdK = gpuArray(dK);
gdN_KK = gpuArray(dN_KK);
geval_ind = gpuArray(eval_ind);
R_gtemp = repmat(gnu_bar.^2,len_eval,1);
R_gnubar_K = repmat(gnu_bar.*gK,len_eval,1);
R_gnu_bar_eval_sq = repmat(gnu_bar_eval'.^2,1,len_nubar);
R_gtemp = (R_gnubar_K) ./ (R_gtemp - R_gnu_bar_eval_sq);
for i = 1:len_eval
R_gtemp(i,eval_ind(i)) = 0;
end
gdN_KK = sum(R_gtemp,2)';
gdN_KK = 2*gdN_KK + gK(eval_ind)./(2*gnu_bar_eval) + gdK(geval_ind);
gdN_KK = (1/(pi*p_density))*gdN_KK;
dN_KK =gather(gdN_KK);
wait(gGPU);
N_KK = dN_KK + n_inf;
toc
plot(nu_bar_eval,dN_KK);
hold on
end
fprintf('\rGPU for un-optimized single precision\r');
gGPU = gpuDevice(1);
reset(gGPU);
tic
gnu_bar = gpuArray(single(nu_bar));
gK = gpuArray(single(K));
gdK = gpuArray(single(dK));
gdN_KK = gpuArray(single(dN_KK));
for i = 1:len_eval
jj = eval_ind(i);
gdN_KK(i) = sum(gnu_bar([1:jj-1, jj+1:end]) .* gK([1:jj-1, jj+1:end]) ./ (gnu_bar([1:jj-1, jj+1:end]).^2 - gnu_bar(jj).^2));
gdN_KK(i) = 2*gdN_KK(i) + gK(jj)./(2*gnu_bar(jj)) + gdK(jj);
end
gdN_KK = (1/(pi*p_density))*gdN_KK;
dN_KK =double(gather(gdN_KK));
wait(gGPU);
toc
plot(nu_bar_eval,dN_KK);
hold on
fprintf('\rGPU for 1D vector optimized single precision\r');
gGPU = gpuDevice(1);
reset(gGPU);
tic
gnu_bar = gpuArray(single(nu_bar));
gK = gpuArray(single(K));
gdK = gpuArray(single(dK));
gdN_KK = gpuArray(single(dN_KK));
gnu_bar_eval = gpuArray(single(nu_bar_eval));
geval_ind = gpuArray(single(eval_ind));
gpart_a = gnu_bar .* gK;
gpart_b = gnu_bar .^2;
for i = 1:len_eval
jj = geval_ind(i);
gpart_c = gpart_a ./ (gpart_b - gpart_b(jj));
gpart_c(jj) = 0;
gdN_KK(i)= sum(gpart_c);
end
gdN_KK = 2*gdN_KK + gK(geval_ind)./(2*gnu_bar_eval) + gdK(geval_ind);
gdN_KK = (1/(pi*p_density))*gdN_KK;
dN_KK =double(gather(gdN_KK));
wait(gGPU);
N_KK = dN_KK + n_inf;
toc
plot(nu_bar_eval,dN_KK);
hold on
fprintf('\rGPU for 2D vector optimized single precision\r');
gGPU = gpuDevice(1);
reset(gGPU);
mem_GPU_avail = gGPU.AvailableMemory;
mem_need = len_nubar*len_eval*4*5;
if mem_need > mem_GPU_avail
fprintf('not enough memory, need %10.3e, only %10.3e available\r',mem_need, mem_GPU_avail);
proceed = false;
else
fprintf('have enough memory, need %10.3e, %10.3e available\r',mem_need, mem_GPU_avail);
proceed = true;
end
if proceed
tic
gnu_bar = gpuArray(single(nu_bar));
gK = gpuArray(single(K));
gdK = gpuArray(single(dK));
gdN_KK = gpuArray(single(dN_KK));
gnu_bar_eval = gpuArray(single(nu_bar_eval));
geval_ind = gpuArray(single(eval_ind));
R_gtemp = repmat(gnu_bar.^2,len_eval,1);
R_gnubar_K = repmat(gnu_bar.*gK,len_eval,1);
R_gnu_bar_eval_sq = repmat(gnu_bar_eval'.^2,1,len_nubar);
R_gtemp = (R_gnubar_K) ./ (R_gtemp - R_gnu_bar_eval_sq);
for i = 1:len_eval
R_gtemp(i,eval_ind(i)) = 0;
end
gdN_KK = sum(R_gtemp,2)';
gdN_KK = 2*gdN_KK + gK(eval_ind)./(2*gnu_bar_eval) + gdK(geval_ind);
gdN_KK = (1/(pi*p_density))*gdN_KK;
dN_KK =double(gather(gdN_KK));
wait(gGPU);
N_KK = dN_KK + n_inf;
toc
plot(nu_bar_eval,dN_KK);
hold on
end
end