1 | /*
|
---|
2 | * $Id: eval.cc,v 1.1.1.1 1999-04-09 17:59:03 ansari Exp $
|
---|
3 | *
|
---|
4 | * $Log: not supported by cvs2svn $
|
---|
5 | * Revision 1.2 1998/03/14 00:04:47 tveldhui
|
---|
6 | * 0.2-alpha-05
|
---|
7 | *
|
---|
8 | * Revision 1.1 1998/02/25 20:04:01 tveldhui
|
---|
9 | * Initial revision
|
---|
10 | *
|
---|
11 | */
|
---|
12 |
|
---|
13 | #ifndef BZ_ARRAYEVAL_CC
|
---|
14 | #define BZ_ARRAYEVAL_CC
|
---|
15 |
|
---|
16 | #ifndef BZ_ARRAY_H
|
---|
17 | #error <blitz/array/eval.cc> must be included via <blitz/array.h>
|
---|
18 | #endif
|
---|
19 |
|
---|
20 | BZ_NAMESPACE(blitz)
|
---|
21 |
|
---|
22 | /*
|
---|
23 | * Assign an expression to an array. For performance reasons, there are
|
---|
24 | * several traversal mechanisms:
|
---|
25 | *
|
---|
26 | * - Index traversal scans through the destination array in storage order.
|
---|
27 | * The expression is evaluated using a TinyVector<int,N> operand. This
|
---|
28 | * version is used only when there are index placeholders in the expression
|
---|
29 | * (see <blitz/indexexpr.h>)
|
---|
30 | * - Stack traversal also scans through the destination array in storage
|
---|
31 | * order. However, push/pop stack iterators are used.
|
---|
32 | * - Fast traversal follows a Hilbert (or other) space-filling curve to
|
---|
33 | * improve cache reuse for stencilling operations. Currently, the
|
---|
34 | * space filling curves must be generated by calling
|
---|
35 | * generateFastTraversalOrder(TinyVector<int,N_dimensions>).
|
---|
36 | * - 2D tiled traversal follows a tiled traversal, to improve cache reuse
|
---|
37 | * for 2D stencils. Space filling curves have too much overhead to use
|
---|
38 | * in two-dimensions.
|
---|
39 | *
|
---|
40 | * _bz_tryFastTraversal is a helper class. Fast traversals are only
|
---|
41 | * attempted if the expression looks like a stencil -- it's at least
|
---|
42 | * three-dimensional, has at least six array operands, and there are
|
---|
43 | * no index placeholders in the expression. These are all things which
|
---|
44 | * can be checked at compile time, so the if()/else() syntax has been
|
---|
45 | * replaced with this class template.
|
---|
46 | */
|
---|
47 |
|
---|
48 | // Fast traversals require <set> from the ISO/ANSI C++ standard library
|
---|
49 | #ifdef BZ_HAVE_STD
|
---|
50 |
|
---|
51 | template<_bz_bool canTryFastTraversal>
|
---|
52 | struct _bz_tryFastTraversal {
|
---|
53 | template<class T_numtype, int N_rank, class T_expr, class T_update>
|
---|
54 | static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
|
---|
55 | _bz_ArrayExpr<T_expr> expr, T_update)
|
---|
56 | {
|
---|
57 | return _bz_false;
|
---|
58 | }
|
---|
59 | };
|
---|
60 |
|
---|
61 | template<>
|
---|
62 | struct _bz_tryFastTraversal<_bz_true> {
|
---|
63 | template<class T_numtype, int N_rank, class T_expr, class T_update>
|
---|
64 | static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
|
---|
65 | _bz_ArrayExpr<T_expr> expr, T_update)
|
---|
66 | {
|
---|
67 | // See if there's an appropriate space filling curve available.
|
---|
68 | // Currently fast traversals use an N-1 dimensional curve. The
|
---|
69 | // Nth dimension column corresponding to each point on the curve
|
---|
70 | // is traversed in the normal fashion.
|
---|
71 | TraversalOrderCollection<N_rank-1> traversals;
|
---|
72 | TinyVector<int, N_rank - 1> traversalGridSize;
|
---|
73 |
|
---|
74 | for (int i=0; i < N_rank - 1; ++i)
|
---|
75 | traversalGridSize[i] = array.length(array.ordering(i+1));
|
---|
76 |
|
---|
77 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
78 | cout << "traversalGridSize = " << traversalGridSize << endl;
|
---|
79 | cout.flush();
|
---|
80 | #endif
|
---|
81 |
|
---|
82 | const TraversalOrder<N_rank-1>* order =
|
---|
83 | traversals.find(traversalGridSize);
|
---|
84 |
|
---|
85 | if (order)
|
---|
86 | {
|
---|
87 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
88 | cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
|
---|
89 | << ", " << N_rank << ">: Using stack traversal" << endl;
|
---|
90 | #endif
|
---|
91 | // A curve was available -- use fast traversal.
|
---|
92 | array.evaluateWithFastTraversal(*order, expr, T_update());
|
---|
93 | return _bz_true;
|
---|
94 | }
|
---|
95 |
|
---|
96 | return _bz_false;
|
---|
97 | }
|
---|
98 | };
|
---|
99 |
|
---|
100 | #endif // BZ_HAVE_STD
|
---|
101 |
|
---|
102 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
103 | inline Array<T_numtype, N_rank>&
|
---|
104 | Array<T_numtype, N_rank>::evaluate(_bz_ArrayExpr<T_expr> expr, T_update)
|
---|
105 | {
|
---|
106 | // Check that all arrays have the same shape
|
---|
107 | #ifdef BZ_DEBUG
|
---|
108 | if (!expr.shapeCheck(shape()))
|
---|
109 | {
|
---|
110 | if (assertFailMode == _bz_false)
|
---|
111 | {
|
---|
112 | cerr << "[Blitz++] Shape check failed: Module " << __FILE__
|
---|
113 | << " line " << __LINE__ << endl
|
---|
114 | << " Expression: ";
|
---|
115 | prettyPrintFormat format(_bz_true); // Use terse formatting
|
---|
116 | string str;
|
---|
117 | expr.prettyPrint(str, format);
|
---|
118 | cerr << str << endl ;
|
---|
119 | }
|
---|
120 |
|
---|
121 | #if 0
|
---|
122 | // Shape dumping is broken by change to using string for prettyPrint
|
---|
123 | << " Shapes: " << shape() << " = ";
|
---|
124 | prettyPrintFormat format2;
|
---|
125 | format2.setDumpArrayShapesMode();
|
---|
126 | expr.prettyPrint(cerr, format2);
|
---|
127 | cerr << endl;
|
---|
128 | #endif
|
---|
129 | BZ_PRE_FAIL;
|
---|
130 | }
|
---|
131 | #endif
|
---|
132 |
|
---|
133 | BZPRECHECK(expr.shapeCheck(shape()),
|
---|
134 | "Shape check failed." << endl << "Expression:");
|
---|
135 |
|
---|
136 | BZPRECHECK((T_expr::rank == N_rank) || (T_expr::numArrayOperands == 0),
|
---|
137 | "Assigned rank " << T_expr::rank << " expression to rank "
|
---|
138 | << N_rank << " array.");
|
---|
139 |
|
---|
140 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
141 | cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders
|
---|
142 | << endl; cout.flush();
|
---|
143 | #endif
|
---|
144 |
|
---|
145 | // Tau profiling code. Provide Tau with a pretty-printed version of
|
---|
146 | // the expression.
|
---|
147 | // NEEDS_WORK-- use a static initializer somehow.
|
---|
148 |
|
---|
149 | #ifdef BZ_TAU_PROFILING
|
---|
150 | static string exprDescription;
|
---|
151 | if (!exprDescription.length()) // faked static initializer
|
---|
152 | {
|
---|
153 | exprDescription = "A";
|
---|
154 | prettyPrintFormat format(_bz_true); // Terse mode on
|
---|
155 | format.nextArrayOperandSymbol();
|
---|
156 | T_update::prettyPrint(exprDescription);
|
---|
157 | expr.prettyPrint(exprDescription, format);
|
---|
158 | }
|
---|
159 | TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
|
---|
160 | #endif
|
---|
161 |
|
---|
162 | // Determine which evaluation mechanism to use
|
---|
163 | if (T_expr::numIndexPlaceholders > 0)
|
---|
164 | {
|
---|
165 | // The expression involves index placeholders, so have to
|
---|
166 | // use index traversal rather than stack traversal.
|
---|
167 |
|
---|
168 | if (N_rank == 1)
|
---|
169 | return evaluateWithIndexTraversal1(expr, T_update());
|
---|
170 | else
|
---|
171 | return evaluateWithIndexTraversalN(expr, T_update());
|
---|
172 | }
|
---|
173 | else {
|
---|
174 |
|
---|
175 | // If this expression looks like an array stencil, then attempt to
|
---|
176 | // use a fast traversal order.
|
---|
177 | // Fast traversals require <set> from the ISO/ANSI C++ standard
|
---|
178 | // library.
|
---|
179 |
|
---|
180 | #ifdef BZ_HAVE_STD
|
---|
181 |
|
---|
182 | enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
|
---|
183 | && (T_expr::numIndexPlaceholders == 0) };
|
---|
184 |
|
---|
185 | if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update()))
|
---|
186 | return *this;
|
---|
187 |
|
---|
188 | #endif
|
---|
189 |
|
---|
190 | #ifdef BZ_ARRAY_2D_STENCIL_TILING
|
---|
191 | // Does this look like a 2-dimensional stencil on a largeish
|
---|
192 | // array?
|
---|
193 |
|
---|
194 | if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
|
---|
195 | {
|
---|
196 | // Use a heuristic to determine whether a tiled traversal
|
---|
197 | // is desirable. First, estimate how much L1 cache is needed
|
---|
198 | // to achieve a high hit rate using the stack traversal.
|
---|
199 | // Try to err on the side of using tiled traversal even when
|
---|
200 | // it isn't strictly needed.
|
---|
201 |
|
---|
202 | // Assumptions:
|
---|
203 | // Stencil width 3
|
---|
204 | // 3 arrays involved in stencil
|
---|
205 | // Uniform data type in arrays (all T_numtype)
|
---|
206 |
|
---|
207 | int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0));
|
---|
208 | if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE)
|
---|
209 | return evaluateWithTiled2DTraversal(expr, T_update());
|
---|
210 | }
|
---|
211 |
|
---|
212 | #endif
|
---|
213 |
|
---|
214 | // If fast traversal isn't available or appropriate, then just
|
---|
215 | // do a stack traversal.
|
---|
216 | if (N_rank == 1)
|
---|
217 | return evaluateWithStackTraversal1(expr, T_update());
|
---|
218 | else
|
---|
219 | return evaluateWithStackTraversalN(expr, T_update());
|
---|
220 | }
|
---|
221 | }
|
---|
222 |
|
---|
223 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
224 | inline Array<T_numtype, N_rank>&
|
---|
225 | Array<T_numtype, N_rank>::evaluateWithStackTraversal1(_bz_ArrayExpr<T_expr>
|
---|
226 | expr, T_update)
|
---|
227 | {
|
---|
228 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
229 | BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
|
---|
230 | << ", " << N_rank << ">: Using stack traversal");
|
---|
231 | #endif
|
---|
232 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
233 | iter.loadStride(firstRank);
|
---|
234 | expr.loadStride(firstRank);
|
---|
235 |
|
---|
236 | _bz_bool useUnitStride = iter.isUnitStride(firstRank)
|
---|
237 | && expr.isUnitStride(firstRank);
|
---|
238 |
|
---|
239 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
240 | int commonStride = expr.suggestStride(firstRank);
|
---|
241 | if (iter.suggestStride(firstRank) > commonStride)
|
---|
242 | commonStride = iter.suggestStride(firstRank);
|
---|
243 | bool useCommonStride = iter.isStride(firstRank,commonStride)
|
---|
244 | && expr.isStride(firstRank,commonStride);
|
---|
245 |
|
---|
246 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
247 | BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl
|
---|
248 | << " commonStride = " << commonStride << " useCommonStride = "
|
---|
249 | << useCommonStride);
|
---|
250 | #endif
|
---|
251 | #else
|
---|
252 | int commonStride = 1;
|
---|
253 | bool useCommonStride = _bz_false;
|
---|
254 | #endif
|
---|
255 |
|
---|
256 | const T_numtype * last = iter.data() + length(firstRank)
|
---|
257 | * stride(firstRank);
|
---|
258 |
|
---|
259 | if (useUnitStride || useCommonStride)
|
---|
260 | {
|
---|
261 | #ifdef BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
262 |
|
---|
263 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
264 | BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");
|
---|
265 | #endif
|
---|
266 | int ubound = length(firstRank) * commonStride;
|
---|
267 | T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
|
---|
268 |
|
---|
269 | if (commonStride == 1)
|
---|
270 | {
|
---|
271 | #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
|
---|
272 | for (int i=0; i < ubound; ++i)
|
---|
273 | T_update::update(data[i], expr.fastRead(i));
|
---|
274 | #else
|
---|
275 | int n1 = ubound & 3;
|
---|
276 | int i = 0;
|
---|
277 | for (; i < n1; ++i)
|
---|
278 | T_update::update(data[i], expr.fastRead(i));
|
---|
279 |
|
---|
280 | for (; i < ubound; i += 4)
|
---|
281 | {
|
---|
282 | #ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS
|
---|
283 | T_update::update(data[i], expr.fastRead(i));
|
---|
284 | T_update::update(data[i+1], expr.fastRead(i+1));
|
---|
285 | T_update::update(data[i+2], expr.fastRead(i+2));
|
---|
286 | T_update::update(data[i+3], expr.fastRead(i+3));
|
---|
287 | #else
|
---|
288 | int t1 = i+1;
|
---|
289 | int t2 = i+2;
|
---|
290 | int t3 = i+3;
|
---|
291 |
|
---|
292 | _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4;
|
---|
293 |
|
---|
294 | tmp1 = expr.fastRead(i);
|
---|
295 | tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1));
|
---|
296 | tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2));
|
---|
297 | tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3));
|
---|
298 |
|
---|
299 | T_update::update(data[i], BZ_NO_PROPAGATE(tmp1));
|
---|
300 | T_update::update(data[BZ_NO_PROPAGATE(t1)], tmp2);
|
---|
301 | T_update::update(data[BZ_NO_PROPAGATE(t2)], tmp3);
|
---|
302 | T_update::update(data[BZ_NO_PROPAGATE(t3)], tmp4);
|
---|
303 | #endif
|
---|
304 | }
|
---|
305 | #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
|
---|
306 |
|
---|
307 | }
|
---|
308 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
309 | else {
|
---|
310 |
|
---|
311 | #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
|
---|
312 | for (int i=0; i < ubound; i += commonStride)
|
---|
313 | T_update::update(data[i], expr.fastRead(i));
|
---|
314 | #else
|
---|
315 | int n1 = (length(firstRank) & 3) * commonStride;
|
---|
316 |
|
---|
317 | int i = 0;
|
---|
318 | for (; i < n1; i += commonStride)
|
---|
319 | T_update::update(data[i], expr.fastRead(i));
|
---|
320 |
|
---|
321 | int strideInc = 4 * commonStride;
|
---|
322 | for (; i < ubound; i += strideInc)
|
---|
323 | {
|
---|
324 | T_update::update(data[i], expr.fastRead(i));
|
---|
325 | int i2 = i + commonStride;
|
---|
326 | T_update::update(data[i2], expr.fastRead(i2));
|
---|
327 | int i3 = i + 2 * commonStride;
|
---|
328 | T_update::update(data[i3], expr.fastRead(i3));
|
---|
329 | int i4 = i + 3 * commonStride;
|
---|
330 | T_update::update(data[i4], expr.fastRead(i4));
|
---|
331 | }
|
---|
332 | #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
|
---|
333 | }
|
---|
334 | #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
335 |
|
---|
336 | #else // ! BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
337 |
|
---|
338 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
339 | BZ_DEBUG_MESSAGE("Common stride, no fast read");
|
---|
340 | #endif
|
---|
341 | while (iter.data() != last)
|
---|
342 | {
|
---|
343 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
344 | iter.advance(commonStride);
|
---|
345 | expr.advance(commonStride);
|
---|
346 | }
|
---|
347 | #endif
|
---|
348 | }
|
---|
349 | else {
|
---|
350 | while (iter.data() != last)
|
---|
351 | {
|
---|
352 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
353 | iter.advance();
|
---|
354 | expr.advance();
|
---|
355 | }
|
---|
356 | }
|
---|
357 |
|
---|
358 | return *this;
|
---|
359 | }
|
---|
360 |
|
---|
361 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
362 | inline Array<T_numtype, N_rank>&
|
---|
363 | Array<T_numtype, N_rank>::evaluateWithStackTraversalN(_bz_ArrayExpr<T_expr>
|
---|
364 | expr, T_update)
|
---|
365 | {
|
---|
366 | /*
|
---|
367 | * A stack traversal replaces the usual nested loops:
|
---|
368 | *
|
---|
369 | * for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
|
---|
370 | * for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
|
---|
371 | * for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
|
---|
372 | * A(i,j,k) = 0;
|
---|
373 | *
|
---|
374 | * with a stack data structure. The stack allows this single
|
---|
375 | * routine to replace any number of nested loops.
|
---|
376 | *
|
---|
377 | * For each dimension (loop), these quantities are needed:
|
---|
378 | * - a pointer to the first element encountered in the loop
|
---|
379 | * - the stride associated with the dimension/loop
|
---|
380 | * - a pointer to the last element encountered in the loop
|
---|
381 | *
|
---|
382 | * The basic idea is that entering each loop is a "push" onto the
|
---|
383 | * stack, and exiting each loop is a "pop". In practice, this
|
---|
384 | * routine treats accesses the stack in a random-access way,
|
---|
385 | * which confuses the picture a bit. But conceptually, that's
|
---|
386 | * what is going on.
|
---|
387 | */
|
---|
388 |
|
---|
389 | /*
|
---|
390 | * ordering(0) gives the dimension associated with the smallest
|
---|
391 | * stride (usually; the exceptions have to do with subarrays and
|
---|
392 | * are uninteresting). We call this dimension maxRank; it will
|
---|
393 | * become the innermost "loop".
|
---|
394 | *
|
---|
395 | * Ordering the loops from ordering(N_rank-1) down to
|
---|
396 | * ordering(0) ensures that the largest stride is associated
|
---|
397 | * with the outermost loop, and the smallest stride with the
|
---|
398 | * innermost. This is critical for good performance on
|
---|
399 | * cached machines.
|
---|
400 | */
|
---|
401 | const int maxRank = ordering(0);
|
---|
402 | const int secondLastRank = ordering(1);
|
---|
403 |
|
---|
404 | // Create an iterator for the array receiving the result
|
---|
405 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
406 |
|
---|
407 | // Set the initial stack configuration by pushing the pointer
|
---|
408 | // to the first element of the array onto the stack N times.
|
---|
409 |
|
---|
410 | int i;
|
---|
411 | for (i=1; i < N_rank; ++i)
|
---|
412 | {
|
---|
413 | iter.push(i);
|
---|
414 | expr.push(i);
|
---|
415 | }
|
---|
416 |
|
---|
417 | // Load the strides associated with the innermost loop.
|
---|
418 | iter.loadStride(maxRank);
|
---|
419 | expr.loadStride(maxRank);
|
---|
420 |
|
---|
421 | /*
|
---|
422 | * Is the stride in the innermost loop equal to 1? If so,
|
---|
423 | * we might take advantage of this and generate more
|
---|
424 | * efficient code.
|
---|
425 | */
|
---|
426 | _bz_bool useUnitStride = iter.isUnitStride(maxRank)
|
---|
427 | && expr.isUnitStride(maxRank);
|
---|
428 |
|
---|
429 | /*
|
---|
430 | * Do all array operands share a common stride in the innermost
|
---|
431 | * loop? If so, we can generate more efficient code (but only
|
---|
432 | * if this optimization has been enabled).
|
---|
433 | */
|
---|
434 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
435 | int commonStride = expr.suggestStride(maxRank);
|
---|
436 | if (iter.suggestStride(maxRank) > commonStride)
|
---|
437 | commonStride = iter.suggestStride(maxRank);
|
---|
438 | bool useCommonStride = iter.isStride(maxRank,commonStride)
|
---|
439 | && expr.isStride(maxRank,commonStride);
|
---|
440 |
|
---|
441 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
442 | BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
|
---|
443 | << "commonStride = " << commonStride << " useCommonStride = "
|
---|
444 | << useCommonStride);
|
---|
445 | #endif
|
---|
446 |
|
---|
447 | #else
|
---|
448 | int commonStride = 1;
|
---|
449 | bool useCommonStride = _bz_false;
|
---|
450 | #endif
|
---|
451 |
|
---|
452 | /*
|
---|
453 | * The "last" array contains a pointer to the last element
|
---|
454 | * encountered in each "loop".
|
---|
455 | */
|
---|
456 | const T_numtype* _bz_restrict last[N_rank];
|
---|
457 |
|
---|
458 | // Set up the initial state of the "last" array
|
---|
459 | for (i=1; i < N_rank; ++i)
|
---|
460 | last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));
|
---|
461 |
|
---|
462 | int lastLength = length(maxRank);
|
---|
463 | int firstNoncollapsedLoop = 1;
|
---|
464 |
|
---|
465 | #ifdef BZ_COLLAPSE_LOOPS
|
---|
466 |
|
---|
467 | /*
|
---|
468 | * This bit of code handles collapsing loops. When possible,
|
---|
469 | * the N nested loops are converted into a single loop (basically,
|
---|
470 | * the N-dimensional array is treated as a long vector).
|
---|
471 | * This is important for cases where the length of the innermost
|
---|
472 | * loop is very small, for example a 100x100x3 array.
|
---|
473 | * If this code can't collapse all the loops into a single loop,
|
---|
474 | * it will collapse as many loops as possible starting from the
|
---|
475 | * innermost and working out.
|
---|
476 | */
|
---|
477 |
|
---|
478 | // Collapse loops when possible
|
---|
479 | for (i=1; i < N_rank; ++i)
|
---|
480 | {
|
---|
481 | // Figure out which pair of loops we are considering combining.
|
---|
482 | int outerLoopRank = ordering(i);
|
---|
483 | int innerLoopRank = ordering(i-1);
|
---|
484 |
|
---|
485 | /*
|
---|
486 | * The canCollapse() routines look at the strides and extents
|
---|
487 | * of the loops, and determine if they can be combined into
|
---|
488 | * one loop.
|
---|
489 | */
|
---|
490 |
|
---|
491 | if (canCollapse(outerLoopRank,innerLoopRank)
|
---|
492 | && expr.canCollapse(outerLoopRank,innerLoopRank))
|
---|
493 | {
|
---|
494 | lastLength *= length(outerLoopRank);
|
---|
495 | firstNoncollapsedLoop = i+1;
|
---|
496 | }
|
---|
497 | }
|
---|
498 | #endif // BZ_COLLAPSE_LOOPS
|
---|
499 |
|
---|
500 | /*
|
---|
501 | * Now we actually perform the loops. This while loop contains
|
---|
502 | * two parts: first, the innermost loop is performed. Then we
|
---|
503 | * exit the loop, and pop our way down the stack until we find
|
---|
504 | * a loop that isn't completed. We then restart the inner loops
|
---|
505 | * and push them onto the stack.
|
---|
506 | */
|
---|
507 |
|
---|
508 | while (true) {
|
---|
509 |
|
---|
510 | /*
|
---|
511 | * This bit of code handles the innermost loop. It would look
|
---|
512 | * a lot simpler if it weren't for unit stride and common stride
|
---|
513 | * optimizations; these clutter up the code with multiple versions.
|
---|
514 | */
|
---|
515 |
|
---|
516 | if ((useUnitStride) || (useCommonStride))
|
---|
517 | {
|
---|
518 | T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
|
---|
519 | + lastLength;
|
---|
520 |
|
---|
521 | #ifdef BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
522 |
|
---|
523 | /*
|
---|
524 | * The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably
|
---|
525 | * be taken out. This was put in place while the unit stride/
|
---|
526 | * common stride optimizations were being implemented and
|
---|
527 | * tested.
|
---|
528 | */
|
---|
529 |
|
---|
530 | // Calculate the end of the innermost loop
|
---|
531 | int ubound = lastLength * commonStride;
|
---|
532 |
|
---|
533 | /*
|
---|
534 | * This is a real kludge. I didn't want to have to write
|
---|
535 | * a const and non-const version of ArrayIterator, so I use a
|
---|
536 | * const iterator and cast away const. This could
|
---|
537 | * probably be avoided with some trick, but the whole routine
|
---|
538 | * is ugly, so why bother.
|
---|
539 | */
|
---|
540 |
|
---|
541 | T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
|
---|
542 |
|
---|
543 | /*
|
---|
544 | * BZ_NEEDS_WORK-- need to implement optional unrolling.
|
---|
545 | */
|
---|
546 | if (commonStride == 1)
|
---|
547 | {
|
---|
548 | for (int i=0; i < ubound; ++i)
|
---|
549 | T_update::update(data[i], expr.fastRead(i));
|
---|
550 | }
|
---|
551 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
552 | else {
|
---|
553 | for (int i=0; i < ubound; i += commonStride)
|
---|
554 | T_update::update(data[i], expr.fastRead(i));
|
---|
555 | }
|
---|
556 | #endif
|
---|
557 | /*
|
---|
558 | * Tidy up for the fact that we haven't actually been
|
---|
559 | * incrementing the iterators in the innermost loop, by
|
---|
560 | * faking it afterward.
|
---|
561 | */
|
---|
562 | iter.advance(lastLength * commonStride);
|
---|
563 | expr.advance(lastLength * commonStride);
|
---|
564 | #else
|
---|
565 | // !BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
566 | // This bit of code not really needed; should remove at some
|
---|
567 | // point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
568 |
|
---|
569 | while (iter.data() != end)
|
---|
570 | {
|
---|
571 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
572 | iter.advance(commonStride);
|
---|
573 | expr.advance(commonStride);
|
---|
574 | }
|
---|
575 | #endif
|
---|
576 | }
|
---|
577 | else {
|
---|
578 | /*
|
---|
579 | * We don't have a unit stride or common stride in the innermost
|
---|
580 | * loop. This is going to hurt performance. Luckily 95% of
|
---|
581 | * the time, we hit the cases above.
|
---|
582 | */
|
---|
583 | T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
|
---|
584 | + lastLength * stride(maxRank);
|
---|
585 |
|
---|
586 | while (iter.data() != end)
|
---|
587 | {
|
---|
588 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
589 | iter.advance();
|
---|
590 | expr.advance();
|
---|
591 | }
|
---|
592 | }
|
---|
593 |
|
---|
594 |
|
---|
595 | /*
|
---|
596 | * We just finished the innermost loop. Now we pop our way down
|
---|
597 | * the stack, until we hit a loop that hasn't completed yet.
|
---|
598 | */
|
---|
599 | int j = firstNoncollapsedLoop;
|
---|
600 | for (; j < N_rank; ++j)
|
---|
601 | {
|
---|
602 | // Get the next loop
|
---|
603 | int r = ordering(j);
|
---|
604 |
|
---|
605 | // Pop-- this restores the data pointers to the first element
|
---|
606 | // encountered in the loop.
|
---|
607 | iter.pop(j);
|
---|
608 | expr.pop(j);
|
---|
609 |
|
---|
610 | // Load the stride associated with this loop, and increment
|
---|
611 | // once.
|
---|
612 | iter.loadStride(r);
|
---|
613 | expr.loadStride(r);
|
---|
614 | iter.advance();
|
---|
615 | expr.advance();
|
---|
616 |
|
---|
617 | // If we aren't at the end of this loop, then stop popping.
|
---|
618 | if (iter.data() != last[j])
|
---|
619 | break;
|
---|
620 | }
|
---|
621 |
|
---|
622 | // Are we completely done?
|
---|
623 | if (j == N_rank)
|
---|
624 | break;
|
---|
625 |
|
---|
626 | // No, so push all the inner loops back onto the stack.
|
---|
627 | for (; j >= firstNoncollapsedLoop; --j)
|
---|
628 | {
|
---|
629 | int r2 = ordering(j-1);
|
---|
630 | iter.push(j);
|
---|
631 | expr.push(j);
|
---|
632 | last[j-1] = iter.data() + length(r2) * stride(r2);
|
---|
633 | }
|
---|
634 |
|
---|
635 | // Load the stride for the innermost loop again.
|
---|
636 | iter.loadStride(maxRank);
|
---|
637 | expr.loadStride(maxRank);
|
---|
638 | }
|
---|
639 |
|
---|
640 | return *this;
|
---|
641 | }
|
---|
642 |
|
---|
643 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
644 | inline Array<T_numtype, N_rank>&
|
---|
645 | Array<T_numtype, N_rank>::evaluateWithIndexTraversal1(_bz_ArrayExpr<T_expr>
|
---|
646 | expr, T_update)
|
---|
647 | {
|
---|
648 | TinyVector<int,N_rank> index;
|
---|
649 |
|
---|
650 | if (stride(firstRank) == 1)
|
---|
651 | {
|
---|
652 | T_numtype * _bz_restrict iter = data_;
|
---|
653 | int last = ubound(firstRank);
|
---|
654 |
|
---|
655 | for (index[0] = lbound(firstRank); index[0] <= last;
|
---|
656 | ++index[0])
|
---|
657 | {
|
---|
658 | iter[index[0]] = expr(index);
|
---|
659 | }
|
---|
660 | }
|
---|
661 | else {
|
---|
662 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
663 | iter.loadStride(0);
|
---|
664 | int last = ubound(firstRank);
|
---|
665 |
|
---|
666 | for (index[0] = lbound(firstRank); index[0] <= last;
|
---|
667 | ++index[0])
|
---|
668 | {
|
---|
669 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
670 | expr(index));
|
---|
671 | iter.advance();
|
---|
672 | }
|
---|
673 | }
|
---|
674 |
|
---|
675 | return *this;
|
---|
676 | }
|
---|
677 |
|
---|
678 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
679 | inline Array<T_numtype, N_rank>&
|
---|
680 | Array<T_numtype, N_rank>::evaluateWithIndexTraversalN(_bz_ArrayExpr<T_expr>
|
---|
681 | expr, T_update)
|
---|
682 | {
|
---|
683 | // Do a stack-type traversal for the destination array and use
|
---|
684 | // index traversal for the source expression
|
---|
685 |
|
---|
686 | const int maxRank = ordering(0);
|
---|
687 | const int secondLastRank = ordering(1);
|
---|
688 |
|
---|
689 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
690 | cout << "Index traversal: N_rank = " << N_rank << endl;
|
---|
691 | cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
|
---|
692 | << endl;
|
---|
693 | cout.flush();
|
---|
694 | #endif
|
---|
695 |
|
---|
696 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
697 | for (int i=1; i < N_rank; ++i)
|
---|
698 | iter.push(ordering(i));
|
---|
699 |
|
---|
700 | iter.loadStride(maxRank);
|
---|
701 |
|
---|
702 | TinyVector<int,N_rank> index, last;
|
---|
703 |
|
---|
704 | index = storage_.base();
|
---|
705 | last = storage_.base() + length_;
|
---|
706 |
|
---|
707 | int lastLength = length(maxRank);
|
---|
708 |
|
---|
709 | while (true) {
|
---|
710 |
|
---|
711 | for (index[maxRank] = base(maxRank);
|
---|
712 | index[maxRank] < last[maxRank];
|
---|
713 | ++index[maxRank])
|
---|
714 | {
|
---|
715 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
716 | #if 0
|
---|
717 | cout << "(" << index[0] << "," << index[1] << ") " << endl;
|
---|
718 | cout.flush();
|
---|
719 | #endif
|
---|
720 | #endif
|
---|
721 |
|
---|
722 | T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
|
---|
723 | iter.advance();
|
---|
724 | }
|
---|
725 |
|
---|
726 | int j = 1;
|
---|
727 | for (; j < N_rank; ++j)
|
---|
728 | {
|
---|
729 | iter.pop(ordering(j));
|
---|
730 | iter.loadStride(ordering(j));
|
---|
731 | iter.advance();
|
---|
732 |
|
---|
733 | index[ordering(j-1)] = base(ordering(j-1));
|
---|
734 | ++index[ordering(j)];
|
---|
735 | if (index[ordering(j)] != last[ordering(j)])
|
---|
736 | break;
|
---|
737 | }
|
---|
738 |
|
---|
739 | if (j == N_rank)
|
---|
740 | break;
|
---|
741 |
|
---|
742 | for (; j > 0; --j)
|
---|
743 | {
|
---|
744 | iter.push(ordering(j));
|
---|
745 | }
|
---|
746 | iter.loadStride(maxRank);
|
---|
747 | }
|
---|
748 |
|
---|
749 | return *this;
|
---|
750 | }
|
---|
751 |
|
---|
752 | // Fast traversals require <set> from the ISO/ANSI C++ standard library
|
---|
753 |
|
---|
754 | #ifdef BZ_HAVE_STD
|
---|
755 |
|
---|
756 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
757 | inline Array<T_numtype, N_rank>&
|
---|
758 | Array<T_numtype, N_rank>::evaluateWithFastTraversal(
|
---|
759 | const TraversalOrder<N_rank - 1>& order, _bz_ArrayExpr<T_expr> expr,
|
---|
760 | T_update)
|
---|
761 | {
|
---|
762 | const int maxRank = ordering(0);
|
---|
763 | const int secondLastRank = ordering(1);
|
---|
764 |
|
---|
765 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
766 | cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
|
---|
767 | << endl;
|
---|
768 | #endif
|
---|
769 |
|
---|
770 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
771 | iter.push(0);
|
---|
772 | expr.push(0);
|
---|
773 |
|
---|
774 | _bz_bool useUnitStride = iter.isUnitStride(maxRank)
|
---|
775 | && expr.isUnitStride(maxRank);
|
---|
776 |
|
---|
777 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
778 | int commonStride = expr.suggestStride(maxRank);
|
---|
779 | if (iter.suggestStride(maxRank) > commonStride)
|
---|
780 | commonStride = iter.suggestStride(maxRank);
|
---|
781 | bool useCommonStride = iter.isStride(maxRank,commonStride)
|
---|
782 | && expr.isStride(maxRank,commonStride);
|
---|
783 | #else
|
---|
784 | int commonStride = 1;
|
---|
785 | bool useCommonStride = _bz_false;
|
---|
786 | #endif
|
---|
787 |
|
---|
788 | int lastLength = length(maxRank);
|
---|
789 |
|
---|
790 | for (int i=0; i < order.length(); ++i)
|
---|
791 | {
|
---|
792 | iter.pop(0);
|
---|
793 | expr.pop(0);
|
---|
794 |
|
---|
795 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
796 | cerr << "Traversing: " << order[i] << endl;
|
---|
797 | #endif
|
---|
798 | // Position the iterator at the start of the next column
|
---|
799 | for (int j=1; j < N_rank; ++j)
|
---|
800 | {
|
---|
801 | iter.loadStride(ordering(j));
|
---|
802 | expr.loadStride(ordering(j));
|
---|
803 |
|
---|
804 | int offset = order[i][j-1];
|
---|
805 | iter.advance(offset);
|
---|
806 | expr.advance(offset);
|
---|
807 | }
|
---|
808 |
|
---|
809 | iter.loadStride(maxRank);
|
---|
810 | expr.loadStride(maxRank);
|
---|
811 |
|
---|
812 | // Evaluate the expression along the column
|
---|
813 |
|
---|
814 | if ((useUnitStride) || (useCommonStride))
|
---|
815 | {
|
---|
816 | T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
|
---|
817 | + lastLength * commonStride;
|
---|
818 |
|
---|
819 | #ifdef BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
820 | int ubound = lastLength * commonStride;
|
---|
821 | T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
|
---|
822 |
|
---|
823 | if (commonStride == 1)
|
---|
824 | {
|
---|
825 | #ifndef BZ_ARRAY_FAST_TRAVERSAL_UNROLL
|
---|
826 | for (int i=0; i < ubound; ++i)
|
---|
827 | T_update::update(data[i], expr.fastRead(i));
|
---|
828 | #else
|
---|
829 | int n1 = ubound & 3;
|
---|
830 | int i=0;
|
---|
831 | for (; i < n1; ++i)
|
---|
832 | T_update::update(data[i], expr.fastRead(i));
|
---|
833 |
|
---|
834 | for (; i < ubound; i += 4)
|
---|
835 | {
|
---|
836 | T_update::update(data[i], expr.fastRead(i));
|
---|
837 | T_update::update(data[i+1], expr.fastRead(i+1));
|
---|
838 | T_update::update(data[i+2], expr.fastRead(i+2));
|
---|
839 | T_update::update(data[i+3], expr.fastRead(i+3));
|
---|
840 | }
|
---|
841 | #endif // BZ_ARRAY_FAST_TRAVERSAL_UNROLL
|
---|
842 | }
|
---|
843 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
844 | else {
|
---|
845 | for (int i=0; i < ubound; i += commonStride)
|
---|
846 | T_update::update(data[i], expr.fastRead(i));
|
---|
847 | }
|
---|
848 | #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
849 |
|
---|
850 | iter.advance(lastLength * commonStride);
|
---|
851 | expr.advance(lastLength * commonStride);
|
---|
852 | #else // ! BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
853 | while (iter.data() != last)
|
---|
854 | {
|
---|
855 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
856 | iter.advance(commonStride);
|
---|
857 | expr.advance(commonStride);
|
---|
858 | }
|
---|
859 | #endif // BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
860 |
|
---|
861 | }
|
---|
862 | else {
|
---|
863 | // No common stride
|
---|
864 |
|
---|
865 | T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
|
---|
866 | + lastLength * stride(maxRank);
|
---|
867 |
|
---|
868 | while (iter.data() != last)
|
---|
869 | {
|
---|
870 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
871 | iter.advance();
|
---|
872 | expr.advance();
|
---|
873 | }
|
---|
874 | }
|
---|
875 | }
|
---|
876 |
|
---|
877 | return *this;
|
---|
878 | }
|
---|
879 | #endif // BZ_HAVE_STD
|
---|
880 |
|
---|
881 | #ifdef BZ_ARRAY_2D_NEW_STENCIL_TILING
|
---|
882 |
|
---|
883 | #ifdef BZ_ARRAY_2D_STENCIL_TILING
|
---|
884 |
|
---|
885 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
886 | inline Array<T_numtype, N_rank>&
|
---|
887 | Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
|
---|
888 | expr, T_update)
|
---|
889 | {
|
---|
890 | const int minorRank = ordering(0);
|
---|
891 | const int majorRank = ordering(1);
|
---|
892 |
|
---|
893 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
894 | iter.push(0);
|
---|
895 | expr.push(0);
|
---|
896 |
|
---|
897 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
898 | int count = 0;
|
---|
899 | #endif
|
---|
900 |
|
---|
901 | _bz_bool useUnitStride = iter.isUnitStride(minorRank)
|
---|
902 | && expr.isUnitStride(minorRank);
|
---|
903 |
|
---|
904 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
905 | int commonStride = expr.suggestStride(minorRank);
|
---|
906 | if (iter.suggestStride(minorRank) > commonStride)
|
---|
907 | commonStride = iter.suggestStride(minorRank);
|
---|
908 | bool useCommonStride = iter.isStride(minorRank,commonStride)
|
---|
909 | && expr.isStride(minorRank,commonStride);
|
---|
910 | #else
|
---|
911 | int commonStride = 1;
|
---|
912 | bool useCommonStride = _bz_false;
|
---|
913 | #endif
|
---|
914 |
|
---|
915 | // Determine if a common major stride exists
|
---|
916 | int commonMajorStride = expr.suggestStride(majorRank);
|
---|
917 | if (iter.suggestStride(majorRank) > commonMajorStride)
|
---|
918 | commonMajorStride = iter.suggestStride(majorRank);
|
---|
919 | bool haveCommonMajorStride = iter.isStride(majorRank,commonMajorStride)
|
---|
920 | && expr.isStride(majorRank,commonMajorStride);
|
---|
921 |
|
---|
922 |
|
---|
923 | int maxi = length(majorRank);
|
---|
924 | int maxj = length(minorRank);
|
---|
925 |
|
---|
926 | const int tileHeight = 16, tileWidth = 3;
|
---|
927 |
|
---|
928 | int bi, bj;
|
---|
929 | for (bi=0; bi < maxi; bi += tileHeight)
|
---|
930 | {
|
---|
931 | int ni = bi + tileHeight;
|
---|
932 | if (ni > maxi)
|
---|
933 | ni = maxi;
|
---|
934 |
|
---|
935 | // Move back to the beginning of the array
|
---|
936 | iter.pop(0);
|
---|
937 | expr.pop(0);
|
---|
938 |
|
---|
939 | // Move to the start of this tile row
|
---|
940 | iter.loadStride(majorRank);
|
---|
941 | iter.advance(bi);
|
---|
942 | expr.loadStride(majorRank);
|
---|
943 | expr.advance(bi);
|
---|
944 |
|
---|
945 | // Save this position
|
---|
946 | iter.push(1);
|
---|
947 | expr.push(1);
|
---|
948 |
|
---|
949 | for (bj=0; bj < maxj; bj += tileWidth)
|
---|
950 | {
|
---|
951 | // Move to the beginning of the tile row
|
---|
952 | iter.pop(1);
|
---|
953 | expr.pop(1);
|
---|
954 |
|
---|
955 | // Move to the top of the current tile (bi,bj)
|
---|
956 | iter.loadStride(minorRank);
|
---|
957 | iter.advance(bj);
|
---|
958 | expr.loadStride(minorRank);
|
---|
959 | expr.advance(bj);
|
---|
960 |
|
---|
961 | if (bj + tileWidth <= maxj)
|
---|
962 | {
|
---|
963 | // Strip mining
|
---|
964 |
|
---|
965 | if ((useUnitStride) && (haveCommonMajorStride))
|
---|
966 | {
|
---|
967 | int offset = 0;
|
---|
968 | T_numtype* _bz_restrict data = const_cast<T_numtype*>
|
---|
969 | (iter.data());
|
---|
970 |
|
---|
971 | for (int i=bi; i < ni; ++i)
|
---|
972 | {
|
---|
973 | _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3;
|
---|
974 |
|
---|
975 | // Common subexpression elimination -- compilers
|
---|
976 | // won't necessarily do this on their own.
|
---|
977 | int t1 = offset+1;
|
---|
978 | int t2 = offset+2;
|
---|
979 |
|
---|
980 | tmp1 = expr.fastRead(offset);
|
---|
981 | tmp2 = expr.fastRead(t1);
|
---|
982 | tmp3 = expr.fastRead(t2);
|
---|
983 |
|
---|
984 | T_update::update(data[0], tmp1);
|
---|
985 | T_update::update(data[1], tmp2);
|
---|
986 | T_update::update(data[2], tmp3);
|
---|
987 |
|
---|
988 | offset += commonMajorStride;
|
---|
989 | data += commonMajorStride;
|
---|
990 |
|
---|
991 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
992 | count += 3;
|
---|
993 | #endif
|
---|
994 | }
|
---|
995 | }
|
---|
996 | else {
|
---|
997 |
|
---|
998 | for (int i=bi; i < ni; ++i)
|
---|
999 | {
|
---|
1000 | iter.loadStride(minorRank);
|
---|
1001 | expr.loadStride(minorRank);
|
---|
1002 |
|
---|
1003 | // Loop through current row elements
|
---|
1004 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1005 | *expr);
|
---|
1006 | iter.advance();
|
---|
1007 | expr.advance();
|
---|
1008 |
|
---|
1009 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1010 | *expr);
|
---|
1011 | iter.advance();
|
---|
1012 | expr.advance();
|
---|
1013 |
|
---|
1014 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1015 | *expr);
|
---|
1016 | iter.advance(-2);
|
---|
1017 | expr.advance(-2);
|
---|
1018 |
|
---|
1019 | iter.loadStride(majorRank);
|
---|
1020 | expr.loadStride(majorRank);
|
---|
1021 | iter.advance();
|
---|
1022 | expr.advance();
|
---|
1023 |
|
---|
1024 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
1025 | count += 3;
|
---|
1026 | #endif
|
---|
1027 |
|
---|
1028 | }
|
---|
1029 | }
|
---|
1030 | }
|
---|
1031 | else {
|
---|
1032 |
|
---|
1033 | // This code handles partial tiles at the bottom of the
|
---|
1034 | // array.
|
---|
1035 |
|
---|
1036 | for (int j=bj; j < maxj; ++j)
|
---|
1037 | {
|
---|
1038 | iter.loadStride(majorRank);
|
---|
1039 | expr.loadStride(majorRank);
|
---|
1040 |
|
---|
1041 | for (int i=bi; i < ni; ++i)
|
---|
1042 | {
|
---|
1043 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1044 | *expr);
|
---|
1045 | iter.advance();
|
---|
1046 | expr.advance();
|
---|
1047 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
1048 | ++count;
|
---|
1049 | #endif
|
---|
1050 |
|
---|
1051 | }
|
---|
1052 |
|
---|
1053 | // Move back to the top of this column
|
---|
1054 | iter.advance(bi-ni);
|
---|
1055 | expr.advance(bi-ni);
|
---|
1056 |
|
---|
1057 | // Move over to the next column
|
---|
1058 | iter.loadStride(minorRank);
|
---|
1059 | expr.loadStride(minorRank);
|
---|
1060 |
|
---|
1061 | iter.advance();
|
---|
1062 | expr.advance();
|
---|
1063 | }
|
---|
1064 | }
|
---|
1065 | }
|
---|
1066 | }
|
---|
1067 |
|
---|
1068 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
1069 | cout << "BZ_2D_STENCIL_DEBUG: count = " << count << endl;
|
---|
1070 | #endif
|
---|
1071 |
|
---|
1072 | return *this;
|
---|
1073 | }
|
---|
1074 |
|
---|
1075 | #endif // BZ_ARRAY_2D_STENCIL_TILING
|
---|
1076 | #endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
|
---|
1077 |
|
---|
1078 |
|
---|
1079 |
|
---|
1080 | #ifndef BZ_ARRAY_2D_NEW_STENCIL_TILING
|
---|
1081 |
|
---|
1082 | #ifdef BZ_ARRAY_2D_STENCIL_TILING
|
---|
1083 |
|
---|
1084 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
1085 | inline Array<T_numtype, N_rank>&
|
---|
1086 | Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
|
---|
1087 | expr, T_update)
|
---|
1088 | {
|
---|
1089 | const int minorRank = ordering(0);
|
---|
1090 | const int majorRank = ordering(1);
|
---|
1091 |
|
---|
1092 | const int blockSize = 16;
|
---|
1093 |
|
---|
1094 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
1095 | iter.push(0);
|
---|
1096 | expr.push(0);
|
---|
1097 |
|
---|
1098 | _bz_bool useUnitStride = iter.isUnitStride(minorRank)
|
---|
1099 | && expr.isUnitStride(minorRank);
|
---|
1100 |
|
---|
1101 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
1102 | int commonStride = expr.suggestStride(minorRank);
|
---|
1103 | if (iter.suggestStride(minorRank) > commonStride)
|
---|
1104 | commonStride = iter.suggestStride(minorRank);
|
---|
1105 | bool useCommonStride = iter.isStride(minorRank,commonStride)
|
---|
1106 | && expr.isStride(minorRank,commonStride);
|
---|
1107 | #else
|
---|
1108 | int commonStride = 1;
|
---|
1109 | bool useCommonStride = _bz_false;
|
---|
1110 | #endif
|
---|
1111 |
|
---|
1112 | int maxi = length(majorRank);
|
---|
1113 | int maxj = length(minorRank);
|
---|
1114 |
|
---|
1115 | int bi, bj;
|
---|
1116 | for (bi=0; bi < maxi; bi += blockSize)
|
---|
1117 | {
|
---|
1118 | int ni = bi + blockSize;
|
---|
1119 | if (ni > maxi)
|
---|
1120 | ni = maxi;
|
---|
1121 |
|
---|
1122 | for (bj=0; bj < maxj; bj += blockSize)
|
---|
1123 | {
|
---|
1124 | int nj = bj + blockSize;
|
---|
1125 | if (nj > maxj)
|
---|
1126 | nj = maxj;
|
---|
1127 |
|
---|
1128 | // Move to the beginning of the array
|
---|
1129 | iter.pop(0);
|
---|
1130 | expr.pop(0);
|
---|
1131 |
|
---|
1132 | // Move to the beginning of the tile (bi,bj)
|
---|
1133 | iter.loadStride(majorRank);
|
---|
1134 | iter.advance(bi);
|
---|
1135 | iter.loadStride(minorRank);
|
---|
1136 | iter.advance(bj);
|
---|
1137 |
|
---|
1138 | expr.loadStride(majorRank);
|
---|
1139 | expr.advance(bi);
|
---|
1140 | expr.loadStride(minorRank);
|
---|
1141 | expr.advance(bj);
|
---|
1142 |
|
---|
1143 | // Loop through tile rows
|
---|
1144 | for (int i=bi; i < ni; ++i)
|
---|
1145 | {
|
---|
1146 | // Save the beginning of this tile row
|
---|
1147 | iter.push(1);
|
---|
1148 | expr.push(1);
|
---|
1149 |
|
---|
1150 | // Load the minor stride
|
---|
1151 | iter.loadStride(minorRank);
|
---|
1152 | expr.loadStride(minorRank);
|
---|
1153 |
|
---|
1154 | if (useUnitStride)
|
---|
1155 | {
|
---|
1156 | T_numtype* _bz_restrict data = const_cast<T_numtype*>
|
---|
1157 | (iter.data());
|
---|
1158 |
|
---|
1159 | int ubound = (nj-bj);
|
---|
1160 | for (int j=0; j < ubound; ++j)
|
---|
1161 | T_update::update(data[j], expr.fastRead(j));
|
---|
1162 | }
|
---|
1163 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
1164 | else if (useCommonStride)
|
---|
1165 | {
|
---|
1166 | int ubound = (nj-bj) * commonStride;
|
---|
1167 | T_numtype* _bz_restrict data = const_cast<T_numtype*>
|
---|
1168 | (iter.data());
|
---|
1169 |
|
---|
1170 | for (int j=0; j < ubound; j += commonStride)
|
---|
1171 | T_update::update(data[j], expr.fastRead(j));
|
---|
1172 | }
|
---|
1173 | #endif
|
---|
1174 | else {
|
---|
1175 | for (int j=bj; j < nj; ++j)
|
---|
1176 | {
|
---|
1177 | // Loop through current row elements
|
---|
1178 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1179 | *expr);
|
---|
1180 | iter.advance();
|
---|
1181 | expr.advance();
|
---|
1182 | }
|
---|
1183 | }
|
---|
1184 |
|
---|
1185 | // Move back to the beginning of the tile row, then
|
---|
1186 | // move to the next row
|
---|
1187 | iter.pop(1);
|
---|
1188 | iter.loadStride(majorRank);
|
---|
1189 | iter.advance(1);
|
---|
1190 |
|
---|
1191 | expr.pop(1);
|
---|
1192 | expr.loadStride(majorRank);
|
---|
1193 | expr.advance(1);
|
---|
1194 | }
|
---|
1195 | }
|
---|
1196 | }
|
---|
1197 |
|
---|
1198 | return *this;
|
---|
1199 | }
|
---|
1200 | #endif // BZ_ARRAY_2D_STENCIL_TILING
|
---|
1201 | #endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
|
---|
1202 |
|
---|
1203 | BZ_NAMESPACE_END
|
---|
1204 |
|
---|
1205 | #endif // BZ_ARRAYEVAL_CC
|
---|
1206 |
|
---|