// option 1
double sum1(double *A, int n) {
  double sum = 0.0;
  for (int i = 0; i < n; ++i)
    sum += A[i];
  return sum;
}
// option 2
double sum2(double *A, int n) {
  if (n == 1) return A[0];
  for (int i = 0; i < n/2; ++i)     // B: scratch array of length n/2
    B[i] = A[2*i] + A[2*i+1];
  return sum2(B, n/2);
}
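A self-contained version of option 2 might look as follows (a sketch assuming \(n\) is a power of two; the explicit allocation of the scratch array B and the name sum2_alloc are added here for illustration):

#include <stdlib.h>

// Pairwise-reduction sum; assumes n is a power of two.
double sum2_alloc(const double *A, int n) {
  if (n == 1) return A[0];
  double *B = malloc((n/2) * sizeof(double));   // scratch array for the pairwise sums
  for (int i = 0; i < n/2; ++i)
    B[i] = A[2*i] + A[2*i+1];
  double s = sum2_alloc(B, n/2);
  free(B);
  return s;
}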
speedup = best sequential time / time on \(p\) processors
efficiency = speedup / perfect speedup (where the perfect speedup is \(p\))
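In symbols, with \(T_1\) the best sequential time and \(T_p\) the time on \(p\) processors (notation added here):
\[ S_p = \frac{T_1}{T_p}, \qquad E_p = \frac{S_p}{p} \]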
Ideally, we want to minimize load imbalance, maximize concurrency, and have the smallest possible overhead.
Amdahl's law: with serial fraction \(s\) of the total sequential time \(T\),
\[ T_p = sT + (1-s)\frac{T}{p} \]
\[ S = \frac{T}{T_p} = \frac{1}{s + (1-s)/p} \xrightarrow{\,p \to \infty\,} \frac{1}{s} \]
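For example (numbers chosen here for illustration): with \(s = 0.1\) and \(p = 100\),
\[ S = \frac{1}{0.1 + 0.9/100} \approx 9.2, \]
so even 100 processors give less than a \(10\times\) speedup when 10% of the work is serial.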
DAG: directed acyclic graph
a DAG with \(n\) inputs represents a single computation with no branching
an algorithm is a family of DAGs (to account for branching)
average parallelism, where \(W(n)\) is the work (total number of operations) and \(D(n)\) the depth (length of the critical path):
\[ P(n) = \frac{W(n)}{D(n)} \]
parallel for: all iterations are independent and may execute concurrently
// option 1
double sum1(double *A, int n) {              // W   D
  double sum = 0.0;                          // 1   1
  for (int i = 0; i < n; ++i) sum += A[i];   // n   n
  return sum;                                // 1   1
}
\[ W(n) = 2 + n, \quad D(n) = 2 + n, \quad P(n) = \mathcal{O}(1) \]
// option 2
double sum2(double *A, int n) {        // W       D
  if (n == 1) return A[0];             // 1       1
  parallel for (i = 0; i < n/2; ++i)   // n       1
    B[i] = A[2*i] + A[2*i+1];
  return sum2(B, n/2);                 // W(n/2)  D(n/2)
}
\[ W(n) = n + W(n/2), \quad D(n) = 1 + D(n/2), \quad P(n) = \mathcal{O}(?) \]
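Unrolling these recurrences answers the question above:
\[ W(n) = n + \frac{n}{2} + \frac{n}{4} + \cdots = \mathcal{O}(n), \quad D(n) = \mathcal{O}(\log n), \quad P(n) = \mathcal{O}\!\left(\frac{n}{\log n}\right) \]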
// SPMD pseudocode for process i, i = 1..n: Jacobi iteration for the
// 1D problem u_i = (u_{i-1} + u_{i+1})/2 with boundary values alpha, beta
double u_i = 0.0, u_ip1, u_im1;
if (i == 1) u_im1 = alpha;       // left boundary ghost value
if (i == n) u_ip1 = beta;        // right boundary ghost value
double err = 2*tolerance;        // force at least one iteration
while (err > tolerance) {
  if (i > 1) send(u_i, i-1);
  if (i < n) send(u_i, i+1);
  if (i < n) recv(u_ip1, i+1);
  if (i > 1) recv(u_im1, i-1);
  wait(sends to complete);
  err = u_i;
  u_i = (u_im1 + u_ip1)/2;           // Jacobi update
  err = err - u_i; err = err*err;    // squared local change
  err = sum2(err);                   // reduce_all: sum the squared changes over all processes
}
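For reference, a minimal serial version of the same iteration (a sketch with illustrative names, assuming \(n\) interior points stored in u[1..n] and ghost cells u[0], u[n+1] for the boundary values):

#include <stdlib.h>
#include <string.h>

// Serial Jacobi iteration for u_i = (u_{i-1} + u_{i+1})/2, i = 1..n,
// with fixed boundary values u[0] = alpha and u[n+1] = beta.
void jacobi_1d(double *u, int n, double alpha, double beta, double tolerance) {
  double *v = malloc((n+2) * sizeof(double));
  double err = 2*tolerance;                    // force at least one iteration
  u[0] = alpha; u[n+1] = beta;                 // ghost cells hold the boundary values
  while (err > tolerance) {
    err = 0.0;
    for (int i = 1; i <= n; ++i) {
      v[i] = (u[i-1] + u[i+1]) / 2;            // Jacobi update from the old iterate
      double d = u[i] - v[i];
      err += d*d;                              // accumulate squared change (the reduce_all above)
    }
    memcpy(u + 1, v + 1, n * sizeof(double));  // commit the new iterate
  }
  free(v);
}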
\[ A \in \mathbb{C}^{n\times n}, y_i=\sum_{j=0}^{n-1} A_{ij}x_j, \quad i=0,\cdots,n-1 \]
matvec(A, x, y, n) {                  // W   D
  parallel for (i = 0; i < n; ++i) {  // n   1
    parallel for (j = 0; j < n; ++j)  // n   1
      z[j] = A[i][j] * x[j];          //     (z: per-row scratch array)
    y[i] = sum2(z, n);                // n   log n
  }
}
\[ W(n) = n^2, \quad D(n) = \log n, \quad P(n) = \frac{n^2}{\log n} \]
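For comparison, a minimal serial C version of the same computation (a sketch assuming a real-valued, row-major A; the notes take \(A \in \mathbb{C}^{n\times n}\), but the real case keeps the example short):

// Serial reference: y = A*x for a row-major n-by-n matrix A; W(n) = n^2.
void matvec_serial(const double *A, const double *x, double *y, int n) {
  for (int i = 0; i < n; ++i) {
    double z = 0.0;
    for (int j = 0; j < n; ++j)
      z += A[i*n + j] * x[j];    // dot product of row i with x
    y[i] = z;
  }
}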
consider a divide-and-conquer approach to sum
double sum3(double *A, int n) {
  if (n == 1) return A[0];
  return sum3(A, n/2) + sum3(A + n/2, n/2);
}
sum2 or sum3?
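One way to compare them: if the two recursive calls in sum3 execute in parallel,
\[ W(n) = 2\,W(n/2) + 1 = \mathcal{O}(n), \qquad D(n) = D(n/2) + 1 = \mathcal{O}(\log n), \]
so sum3 matches the asymptotic work, depth, and parallelism of sum2 while avoiding the scratch array B.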