1 | /*
|
---|
2 | * $Id: eval.cc,v 1.1.1.1 1999-11-26 16:37:06 ansari Exp $
|
---|
3 | *
|
---|
4 | * $Log: not supported by cvs2svn $
|
---|
5 | // Revision 1.1.1.1 1999/04/09 17:59:03 ansari
|
---|
6 | // Creation module DPC/Blitz (blitz 0.4) Reza 09/04/99
|
---|
7 | //
|
---|
8 | * Revision 1.2 1998/03/14 00:04:47 tveldhui
|
---|
9 | * 0.2-alpha-05
|
---|
10 | *
|
---|
11 | * Revision 1.1 1998/02/25 20:04:01 tveldhui
|
---|
12 | * Initial revision
|
---|
13 | *
|
---|
14 | */
|
---|
15 |
|
---|
16 | #ifndef BZ_ARRAYEVAL_CC
|
---|
17 | #define BZ_ARRAYEVAL_CC
|
---|
18 |
|
---|
19 | #ifndef BZ_ARRAY_H
|
---|
20 | #error <blitz/array/eval.cc> must be included via <blitz/array.h>
|
---|
21 | #endif
|
---|
22 |
|
---|
23 | BZ_NAMESPACE(blitz)
|
---|
24 |
|
---|
25 | /*
|
---|
26 | * Assign an expression to an array. For performance reasons, there are
|
---|
27 | * several traversal mechanisms:
|
---|
28 | *
|
---|
29 | * - Index traversal scans through the destination array in storage order.
|
---|
30 | * The expression is evaluated using a TinyVector<int,N> operand. This
|
---|
31 | * version is used only when there are index placeholders in the expression
|
---|
32 | * (see <blitz/indexexpr.h>)
|
---|
33 | * - Stack traversal also scans through the destination array in storage
|
---|
34 | * order. However, push/pop stack iterators are used.
|
---|
35 | * - Fast traversal follows a Hilbert (or other) space-filling curve to
|
---|
36 | * improve cache reuse for stencilling operations. Currently, the
|
---|
37 | * space filling curves must be generated by calling
|
---|
38 | * generateFastTraversalOrder(TinyVector<int,N_dimensions>).
|
---|
39 | * - 2D tiled traversal follows a tiled traversal, to improve cache reuse
|
---|
40 | * for 2D stencils. Space filling curves have too much overhead to use
|
---|
41 | * in two-dimensions.
|
---|
42 | *
|
---|
43 | * _bz_tryFastTraversal is a helper class. Fast traversals are only
|
---|
44 | * attempted if the expression looks like a stencil -- it's at least
|
---|
45 | * three-dimensional, has at least six array operands, and there are
|
---|
46 | * no index placeholders in the expression. These are all things which
|
---|
47 | * can be checked at compile time, so the if()/else() syntax has been
|
---|
48 | * replaced with this class template.
|
---|
49 | */
|
---|
50 |
|
---|
51 | // Fast traversals require <set> from the ISO/ANSI C++ standard library
|
---|
52 | #ifdef BZ_HAVE_STD
|
---|
53 |
|
---|
54 | template<_bz_bool canTryFastTraversal>
|
---|
55 | struct _bz_tryFastTraversal {
|
---|
56 | template<class T_numtype, int N_rank, class T_expr, class T_update>
|
---|
57 | static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
|
---|
58 | _bz_ArrayExpr<T_expr> expr, T_update)
|
---|
59 | {
|
---|
60 | return _bz_false;
|
---|
61 | }
|
---|
62 | };
|
---|
63 |
|
---|
64 | template<>
|
---|
65 | struct _bz_tryFastTraversal<_bz_true> {
|
---|
66 | template<class T_numtype, int N_rank, class T_expr, class T_update>
|
---|
67 | static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
|
---|
68 | _bz_ArrayExpr<T_expr> expr, T_update)
|
---|
69 | {
|
---|
70 | // See if there's an appropriate space filling curve available.
|
---|
71 | // Currently fast traversals use an N-1 dimensional curve. The
|
---|
72 | // Nth dimension column corresponding to each point on the curve
|
---|
73 | // is traversed in the normal fashion.
|
---|
74 | TraversalOrderCollection<N_rank-1> traversals;
|
---|
75 | TinyVector<int, N_rank - 1> traversalGridSize;
|
---|
76 |
|
---|
77 | for (int i=0; i < N_rank - 1; ++i)
|
---|
78 | traversalGridSize[i] = array.length(array.ordering(i+1));
|
---|
79 |
|
---|
80 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
81 | cout << "traversalGridSize = " << traversalGridSize << endl;
|
---|
82 | cout.flush();
|
---|
83 | #endif
|
---|
84 |
|
---|
85 | const TraversalOrder<N_rank-1>* order =
|
---|
86 | traversals.find(traversalGridSize);
|
---|
87 |
|
---|
88 | if (order)
|
---|
89 | {
|
---|
90 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
91 | cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
|
---|
92 | << ", " << N_rank << ">: Using stack traversal" << endl;
|
---|
93 | #endif
|
---|
94 | // A curve was available -- use fast traversal.
|
---|
95 | array.evaluateWithFastTraversal(*order, expr, T_update());
|
---|
96 | return _bz_true;
|
---|
97 | }
|
---|
98 |
|
---|
99 | return _bz_false;
|
---|
100 | }
|
---|
101 | };
|
---|
102 |
|
---|
103 | #endif // BZ_HAVE_STD
|
---|
104 |
|
---|
105 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
106 | inline Array<T_numtype, N_rank>&
|
---|
107 | Array<T_numtype, N_rank>::evaluate(_bz_ArrayExpr<T_expr> expr, T_update)
|
---|
108 | {
|
---|
109 | // Check that all arrays have the same shape
|
---|
110 | #ifdef BZ_DEBUG
|
---|
111 | if (!expr.shapeCheck(shape()))
|
---|
112 | {
|
---|
113 | if (assertFailMode == _bz_false)
|
---|
114 | {
|
---|
115 | cerr << "[Blitz++] Shape check failed: Module " << __FILE__
|
---|
116 | << " line " << __LINE__ << endl
|
---|
117 | << " Expression: ";
|
---|
118 | prettyPrintFormat format(_bz_true); // Use terse formatting
|
---|
119 | string str;
|
---|
120 | expr.prettyPrint(str, format);
|
---|
121 | cerr << str << endl ;
|
---|
122 | }
|
---|
123 |
|
---|
124 | #if 0
|
---|
125 | // Shape dumping is broken by change to using string for prettyPrint
|
---|
126 | << " Shapes: " << shape() << " = ";
|
---|
127 | prettyPrintFormat format2;
|
---|
128 | format2.setDumpArrayShapesMode();
|
---|
129 | expr.prettyPrint(cerr, format2);
|
---|
130 | cerr << endl;
|
---|
131 | #endif
|
---|
132 | BZ_PRE_FAIL;
|
---|
133 | }
|
---|
134 | #endif
|
---|
135 |
|
---|
136 | BZPRECHECK(expr.shapeCheck(shape()),
|
---|
137 | "Shape check failed." << endl << "Expression:");
|
---|
138 |
|
---|
139 | BZPRECHECK((T_expr::rank == N_rank) || (T_expr::numArrayOperands == 0),
|
---|
140 | "Assigned rank " << T_expr::rank << " expression to rank "
|
---|
141 | << N_rank << " array.");
|
---|
142 |
|
---|
143 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
144 | cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders
|
---|
145 | << endl; cout.flush();
|
---|
146 | #endif
|
---|
147 |
|
---|
148 | // Tau profiling code. Provide Tau with a pretty-printed version of
|
---|
149 | // the expression.
|
---|
150 | // NEEDS_WORK-- use a static initializer somehow.
|
---|
151 |
|
---|
152 | #ifdef BZ_TAU_PROFILING
|
---|
153 | static string exprDescription;
|
---|
154 | if (!exprDescription.length()) // faked static initializer
|
---|
155 | {
|
---|
156 | exprDescription = "A";
|
---|
157 | prettyPrintFormat format(_bz_true); // Terse mode on
|
---|
158 | format.nextArrayOperandSymbol();
|
---|
159 | T_update::prettyPrint(exprDescription);
|
---|
160 | expr.prettyPrint(exprDescription, format);
|
---|
161 | }
|
---|
162 | TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
|
---|
163 | #endif
|
---|
164 |
|
---|
165 | // Determine which evaluation mechanism to use
|
---|
166 | if (T_expr::numIndexPlaceholders > 0)
|
---|
167 | {
|
---|
168 | // The expression involves index placeholders, so have to
|
---|
169 | // use index traversal rather than stack traversal.
|
---|
170 |
|
---|
171 | if (N_rank == 1)
|
---|
172 | return evaluateWithIndexTraversal1(expr, T_update());
|
---|
173 | else
|
---|
174 | return evaluateWithIndexTraversalN(expr, T_update());
|
---|
175 | }
|
---|
176 | else {
|
---|
177 |
|
---|
178 | // If this expression looks like an array stencil, then attempt to
|
---|
179 | // use a fast traversal order.
|
---|
180 | // Fast traversals require <set> from the ISO/ANSI C++ standard
|
---|
181 | // library.
|
---|
182 |
|
---|
183 | #ifdef BZ_HAVE_STD
|
---|
184 |
|
---|
185 | enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
|
---|
186 | && (T_expr::numIndexPlaceholders == 0) };
|
---|
187 |
|
---|
188 | if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update()))
|
---|
189 | return *this;
|
---|
190 |
|
---|
191 | #endif
|
---|
192 |
|
---|
193 | #ifdef BZ_ARRAY_2D_STENCIL_TILING
|
---|
194 | // Does this look like a 2-dimensional stencil on a largeish
|
---|
195 | // array?
|
---|
196 |
|
---|
197 | if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
|
---|
198 | {
|
---|
199 | // Use a heuristic to determine whether a tiled traversal
|
---|
200 | // is desirable. First, estimate how much L1 cache is needed
|
---|
201 | // to achieve a high hit rate using the stack traversal.
|
---|
202 | // Try to err on the side of using tiled traversal even when
|
---|
203 | // it isn't strictly needed.
|
---|
204 |
|
---|
205 | // Assumptions:
|
---|
206 | // Stencil width 3
|
---|
207 | // 3 arrays involved in stencil
|
---|
208 | // Uniform data type in arrays (all T_numtype)
|
---|
209 |
|
---|
210 | int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0));
|
---|
211 | if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE)
|
---|
212 | return evaluateWithTiled2DTraversal(expr, T_update());
|
---|
213 | }
|
---|
214 |
|
---|
215 | #endif
|
---|
216 |
|
---|
217 | // If fast traversal isn't available or appropriate, then just
|
---|
218 | // do a stack traversal.
|
---|
219 | if (N_rank == 1)
|
---|
220 | return evaluateWithStackTraversal1(expr, T_update());
|
---|
221 | else
|
---|
222 | return evaluateWithStackTraversalN(expr, T_update());
|
---|
223 | }
|
---|
224 | }
|
---|
225 |
|
---|
226 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
227 | inline Array<T_numtype, N_rank>&
|
---|
228 | Array<T_numtype, N_rank>::evaluateWithStackTraversal1(_bz_ArrayExpr<T_expr>
|
---|
229 | expr, T_update)
|
---|
230 | {
|
---|
231 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
232 | BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
|
---|
233 | << ", " << N_rank << ">: Using stack traversal");
|
---|
234 | #endif
|
---|
235 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
236 | iter.loadStride(firstRank);
|
---|
237 | expr.loadStride(firstRank);
|
---|
238 |
|
---|
239 | _bz_bool useUnitStride = iter.isUnitStride(firstRank)
|
---|
240 | && expr.isUnitStride(firstRank);
|
---|
241 |
|
---|
242 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
243 | int commonStride = expr.suggestStride(firstRank);
|
---|
244 | if (iter.suggestStride(firstRank) > commonStride)
|
---|
245 | commonStride = iter.suggestStride(firstRank);
|
---|
246 | bool useCommonStride = iter.isStride(firstRank,commonStride)
|
---|
247 | && expr.isStride(firstRank,commonStride);
|
---|
248 |
|
---|
249 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
250 | BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl
|
---|
251 | << " commonStride = " << commonStride << " useCommonStride = "
|
---|
252 | << useCommonStride);
|
---|
253 | #endif
|
---|
254 | #else
|
---|
255 | int commonStride = 1;
|
---|
256 | bool useCommonStride = _bz_false;
|
---|
257 | #endif
|
---|
258 |
|
---|
259 | const T_numtype * last = iter.data() + length(firstRank)
|
---|
260 | * stride(firstRank);
|
---|
261 |
|
---|
262 | if (useUnitStride || useCommonStride)
|
---|
263 | {
|
---|
264 | #ifdef BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
265 |
|
---|
266 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
267 | BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");
|
---|
268 | #endif
|
---|
269 | int ubound = length(firstRank) * commonStride;
|
---|
270 | T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
|
---|
271 |
|
---|
272 | if (commonStride == 1)
|
---|
273 | {
|
---|
274 | #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
|
---|
275 | for (int i=0; i < ubound; ++i)
|
---|
276 | T_update::update(data[i], expr.fastRead(i));
|
---|
277 | #else
|
---|
278 | int n1 = ubound & 3;
|
---|
279 | int i = 0;
|
---|
280 | for (; i < n1; ++i)
|
---|
281 | T_update::update(data[i], expr.fastRead(i));
|
---|
282 |
|
---|
283 | for (; i < ubound; i += 4)
|
---|
284 | {
|
---|
285 | #ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS
|
---|
286 | T_update::update(data[i], expr.fastRead(i));
|
---|
287 | T_update::update(data[i+1], expr.fastRead(i+1));
|
---|
288 | T_update::update(data[i+2], expr.fastRead(i+2));
|
---|
289 | T_update::update(data[i+3], expr.fastRead(i+3));
|
---|
290 | #else
|
---|
291 | int t1 = i+1;
|
---|
292 | int t2 = i+2;
|
---|
293 | int t3 = i+3;
|
---|
294 |
|
---|
295 | _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4;
|
---|
296 |
|
---|
297 | tmp1 = expr.fastRead(i);
|
---|
298 | tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1));
|
---|
299 | tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2));
|
---|
300 | tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3));
|
---|
301 |
|
---|
302 | T_update::update(data[i], BZ_NO_PROPAGATE(tmp1));
|
---|
303 | T_update::update(data[BZ_NO_PROPAGATE(t1)], tmp2);
|
---|
304 | T_update::update(data[BZ_NO_PROPAGATE(t2)], tmp3);
|
---|
305 | T_update::update(data[BZ_NO_PROPAGATE(t3)], tmp4);
|
---|
306 | #endif
|
---|
307 | }
|
---|
308 | #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
|
---|
309 |
|
---|
310 | }
|
---|
311 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
312 | else {
|
---|
313 |
|
---|
314 | #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
|
---|
315 | for (int i=0; i < ubound; i += commonStride)
|
---|
316 | T_update::update(data[i], expr.fastRead(i));
|
---|
317 | #else
|
---|
318 | int n1 = (length(firstRank) & 3) * commonStride;
|
---|
319 |
|
---|
320 | int i = 0;
|
---|
321 | for (; i < n1; i += commonStride)
|
---|
322 | T_update::update(data[i], expr.fastRead(i));
|
---|
323 |
|
---|
324 | int strideInc = 4 * commonStride;
|
---|
325 | for (; i < ubound; i += strideInc)
|
---|
326 | {
|
---|
327 | T_update::update(data[i], expr.fastRead(i));
|
---|
328 | int i2 = i + commonStride;
|
---|
329 | T_update::update(data[i2], expr.fastRead(i2));
|
---|
330 | int i3 = i + 2 * commonStride;
|
---|
331 | T_update::update(data[i3], expr.fastRead(i3));
|
---|
332 | int i4 = i + 3 * commonStride;
|
---|
333 | T_update::update(data[i4], expr.fastRead(i4));
|
---|
334 | }
|
---|
335 | #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
|
---|
336 | }
|
---|
337 | #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
338 |
|
---|
339 | #else // ! BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
340 |
|
---|
341 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
342 | BZ_DEBUG_MESSAGE("Common stride, no fast read");
|
---|
343 | #endif
|
---|
344 | while (iter.data() != last)
|
---|
345 | {
|
---|
346 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
347 | iter.advance(commonStride);
|
---|
348 | expr.advance(commonStride);
|
---|
349 | }
|
---|
350 | #endif
|
---|
351 | }
|
---|
352 | else {
|
---|
353 | while (iter.data() != last)
|
---|
354 | {
|
---|
355 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
356 | iter.advance();
|
---|
357 | expr.advance();
|
---|
358 | }
|
---|
359 | }
|
---|
360 |
|
---|
361 | return *this;
|
---|
362 | }
|
---|
363 |
|
---|
364 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
365 | inline Array<T_numtype, N_rank>&
|
---|
366 | Array<T_numtype, N_rank>::evaluateWithStackTraversalN(_bz_ArrayExpr<T_expr>
|
---|
367 | expr, T_update)
|
---|
368 | {
|
---|
369 | /*
|
---|
370 | * A stack traversal replaces the usual nested loops:
|
---|
371 | *
|
---|
372 | * for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
|
---|
373 | * for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
|
---|
374 | * for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
|
---|
375 | * A(i,j,k) = 0;
|
---|
376 | *
|
---|
377 | * with a stack data structure. The stack allows this single
|
---|
378 | * routine to replace any number of nested loops.
|
---|
379 | *
|
---|
380 | * For each dimension (loop), these quantities are needed:
|
---|
381 | * - a pointer to the first element encountered in the loop
|
---|
382 | * - the stride associated with the dimension/loop
|
---|
383 | * - a pointer to the last element encountered in the loop
|
---|
384 | *
|
---|
385 | * The basic idea is that entering each loop is a "push" onto the
|
---|
386 | * stack, and exiting each loop is a "pop". In practice, this
|
---|
387 | * routine treats accesses the stack in a random-access way,
|
---|
388 | * which confuses the picture a bit. But conceptually, that's
|
---|
389 | * what is going on.
|
---|
390 | */
|
---|
391 |
|
---|
392 | /*
|
---|
393 | * ordering(0) gives the dimension associated with the smallest
|
---|
394 | * stride (usually; the exceptions have to do with subarrays and
|
---|
395 | * are uninteresting). We call this dimension maxRank; it will
|
---|
396 | * become the innermost "loop".
|
---|
397 | *
|
---|
398 | * Ordering the loops from ordering(N_rank-1) down to
|
---|
399 | * ordering(0) ensures that the largest stride is associated
|
---|
400 | * with the outermost loop, and the smallest stride with the
|
---|
401 | * innermost. This is critical for good performance on
|
---|
402 | * cached machines.
|
---|
403 | */
|
---|
404 | const int maxRank = ordering(0);
|
---|
405 | const int secondLastRank = ordering(1);
|
---|
406 |
|
---|
407 | // Create an iterator for the array receiving the result
|
---|
408 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
409 |
|
---|
410 | // Set the initial stack configuration by pushing the pointer
|
---|
411 | // to the first element of the array onto the stack N times.
|
---|
412 |
|
---|
413 | int i;
|
---|
414 | for (i=1; i < N_rank; ++i)
|
---|
415 | {
|
---|
416 | iter.push(i);
|
---|
417 | expr.push(i);
|
---|
418 | }
|
---|
419 |
|
---|
420 | // Load the strides associated with the innermost loop.
|
---|
421 | iter.loadStride(maxRank);
|
---|
422 | expr.loadStride(maxRank);
|
---|
423 |
|
---|
424 | /*
|
---|
425 | * Is the stride in the innermost loop equal to 1? If so,
|
---|
426 | * we might take advantage of this and generate more
|
---|
427 | * efficient code.
|
---|
428 | */
|
---|
429 | _bz_bool useUnitStride = iter.isUnitStride(maxRank)
|
---|
430 | && expr.isUnitStride(maxRank);
|
---|
431 |
|
---|
432 | /*
|
---|
433 | * Do all array operands share a common stride in the innermost
|
---|
434 | * loop? If so, we can generate more efficient code (but only
|
---|
435 | * if this optimization has been enabled).
|
---|
436 | */
|
---|
437 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
438 | int commonStride = expr.suggestStride(maxRank);
|
---|
439 | if (iter.suggestStride(maxRank) > commonStride)
|
---|
440 | commonStride = iter.suggestStride(maxRank);
|
---|
441 | bool useCommonStride = iter.isStride(maxRank,commonStride)
|
---|
442 | && expr.isStride(maxRank,commonStride);
|
---|
443 |
|
---|
444 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
445 | BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
|
---|
446 | << "commonStride = " << commonStride << " useCommonStride = "
|
---|
447 | << useCommonStride);
|
---|
448 | #endif
|
---|
449 |
|
---|
450 | #else
|
---|
451 | int commonStride = 1;
|
---|
452 | bool useCommonStride = _bz_false;
|
---|
453 | #endif
|
---|
454 |
|
---|
455 | /*
|
---|
456 | * The "last" array contains a pointer to the last element
|
---|
457 | * encountered in each "loop".
|
---|
458 | */
|
---|
459 | const T_numtype* _bz_restrict last[N_rank];
|
---|
460 |
|
---|
461 | // Set up the initial state of the "last" array
|
---|
462 | for (i=1; i < N_rank; ++i)
|
---|
463 | last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));
|
---|
464 |
|
---|
465 | int lastLength = length(maxRank);
|
---|
466 | int firstNoncollapsedLoop = 1;
|
---|
467 |
|
---|
468 | #ifdef BZ_COLLAPSE_LOOPS
|
---|
469 |
|
---|
470 | /*
|
---|
471 | * This bit of code handles collapsing loops. When possible,
|
---|
472 | * the N nested loops are converted into a single loop (basically,
|
---|
473 | * the N-dimensional array is treated as a long vector).
|
---|
474 | * This is important for cases where the length of the innermost
|
---|
475 | * loop is very small, for example a 100x100x3 array.
|
---|
476 | * If this code can't collapse all the loops into a single loop,
|
---|
477 | * it will collapse as many loops as possible starting from the
|
---|
478 | * innermost and working out.
|
---|
479 | */
|
---|
480 |
|
---|
481 | // Collapse loops when possible
|
---|
482 | for (i=1; i < N_rank; ++i)
|
---|
483 | {
|
---|
484 | // Figure out which pair of loops we are considering combining.
|
---|
485 | int outerLoopRank = ordering(i);
|
---|
486 | int innerLoopRank = ordering(i-1);
|
---|
487 |
|
---|
488 | /*
|
---|
489 | * The canCollapse() routines look at the strides and extents
|
---|
490 | * of the loops, and determine if they can be combined into
|
---|
491 | * one loop.
|
---|
492 | */
|
---|
493 |
|
---|
494 | if (canCollapse(outerLoopRank,innerLoopRank)
|
---|
495 | && expr.canCollapse(outerLoopRank,innerLoopRank))
|
---|
496 | {
|
---|
497 | lastLength *= length(outerLoopRank);
|
---|
498 | firstNoncollapsedLoop = i+1;
|
---|
499 | }
|
---|
500 | }
|
---|
501 | #endif // BZ_COLLAPSE_LOOPS
|
---|
502 |
|
---|
503 | /*
|
---|
504 | * Now we actually perform the loops. This while loop contains
|
---|
505 | * two parts: first, the innermost loop is performed. Then we
|
---|
506 | * exit the loop, and pop our way down the stack until we find
|
---|
507 | * a loop that isn't completed. We then restart the inner loops
|
---|
508 | * and push them onto the stack.
|
---|
509 | */
|
---|
510 |
|
---|
511 | while (true) {
|
---|
512 |
|
---|
513 | /*
|
---|
514 | * This bit of code handles the innermost loop. It would look
|
---|
515 | * a lot simpler if it weren't for unit stride and common stride
|
---|
516 | * optimizations; these clutter up the code with multiple versions.
|
---|
517 | */
|
---|
518 |
|
---|
519 | if ((useUnitStride) || (useCommonStride))
|
---|
520 | {
|
---|
521 | T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
|
---|
522 | + lastLength;
|
---|
523 |
|
---|
524 | #ifdef BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
525 |
|
---|
526 | /*
|
---|
527 | * The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably
|
---|
528 | * be taken out. This was put in place while the unit stride/
|
---|
529 | * common stride optimizations were being implemented and
|
---|
530 | * tested.
|
---|
531 | */
|
---|
532 |
|
---|
533 | // Calculate the end of the innermost loop
|
---|
534 | int ubound = lastLength * commonStride;
|
---|
535 |
|
---|
536 | /*
|
---|
537 | * This is a real kludge. I didn't want to have to write
|
---|
538 | * a const and non-const version of ArrayIterator, so I use a
|
---|
539 | * const iterator and cast away const. This could
|
---|
540 | * probably be avoided with some trick, but the whole routine
|
---|
541 | * is ugly, so why bother.
|
---|
542 | */
|
---|
543 |
|
---|
544 | T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
|
---|
545 |
|
---|
546 | /*
|
---|
547 | * BZ_NEEDS_WORK-- need to implement optional unrolling.
|
---|
548 | */
|
---|
549 | if (commonStride == 1)
|
---|
550 | {
|
---|
551 | for (int i=0; i < ubound; ++i)
|
---|
552 | T_update::update(data[i], expr.fastRead(i));
|
---|
553 | }
|
---|
554 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
555 | else {
|
---|
556 | for (int i=0; i < ubound; i += commonStride)
|
---|
557 | T_update::update(data[i], expr.fastRead(i));
|
---|
558 | }
|
---|
559 | #endif
|
---|
560 | /*
|
---|
561 | * Tidy up for the fact that we haven't actually been
|
---|
562 | * incrementing the iterators in the innermost loop, by
|
---|
563 | * faking it afterward.
|
---|
564 | */
|
---|
565 | iter.advance(lastLength * commonStride);
|
---|
566 | expr.advance(lastLength * commonStride);
|
---|
567 | #else
|
---|
568 | // !BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
569 | // This bit of code not really needed; should remove at some
|
---|
570 | // point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
571 |
|
---|
572 | while (iter.data() != end)
|
---|
573 | {
|
---|
574 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
575 | iter.advance(commonStride);
|
---|
576 | expr.advance(commonStride);
|
---|
577 | }
|
---|
578 | #endif
|
---|
579 | }
|
---|
580 | else {
|
---|
581 | /*
|
---|
582 | * We don't have a unit stride or common stride in the innermost
|
---|
583 | * loop. This is going to hurt performance. Luckily 95% of
|
---|
584 | * the time, we hit the cases above.
|
---|
585 | */
|
---|
586 | T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
|
---|
587 | + lastLength * stride(maxRank);
|
---|
588 |
|
---|
589 | while (iter.data() != end)
|
---|
590 | {
|
---|
591 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
592 | iter.advance();
|
---|
593 | expr.advance();
|
---|
594 | }
|
---|
595 | }
|
---|
596 |
|
---|
597 |
|
---|
598 | /*
|
---|
599 | * We just finished the innermost loop. Now we pop our way down
|
---|
600 | * the stack, until we hit a loop that hasn't completed yet.
|
---|
601 | */
|
---|
602 | int j = firstNoncollapsedLoop;
|
---|
603 | for (; j < N_rank; ++j)
|
---|
604 | {
|
---|
605 | // Get the next loop
|
---|
606 | int r = ordering(j);
|
---|
607 |
|
---|
608 | // Pop-- this restores the data pointers to the first element
|
---|
609 | // encountered in the loop.
|
---|
610 | iter.pop(j);
|
---|
611 | expr.pop(j);
|
---|
612 |
|
---|
613 | // Load the stride associated with this loop, and increment
|
---|
614 | // once.
|
---|
615 | iter.loadStride(r);
|
---|
616 | expr.loadStride(r);
|
---|
617 | iter.advance();
|
---|
618 | expr.advance();
|
---|
619 |
|
---|
620 | // If we aren't at the end of this loop, then stop popping.
|
---|
621 | if (iter.data() != last[j])
|
---|
622 | break;
|
---|
623 | }
|
---|
624 |
|
---|
625 | // Are we completely done?
|
---|
626 | if (j == N_rank)
|
---|
627 | break;
|
---|
628 |
|
---|
629 | // No, so push all the inner loops back onto the stack.
|
---|
630 | for (; j >= firstNoncollapsedLoop; --j)
|
---|
631 | {
|
---|
632 | int r2 = ordering(j-1);
|
---|
633 | iter.push(j);
|
---|
634 | expr.push(j);
|
---|
635 | last[j-1] = iter.data() + length(r2) * stride(r2);
|
---|
636 | }
|
---|
637 |
|
---|
638 | // Load the stride for the innermost loop again.
|
---|
639 | iter.loadStride(maxRank);
|
---|
640 | expr.loadStride(maxRank);
|
---|
641 | }
|
---|
642 |
|
---|
643 | return *this;
|
---|
644 | }
|
---|
645 |
|
---|
646 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
647 | inline Array<T_numtype, N_rank>&
|
---|
648 | Array<T_numtype, N_rank>::evaluateWithIndexTraversal1(_bz_ArrayExpr<T_expr>
|
---|
649 | expr, T_update)
|
---|
650 | {
|
---|
651 | TinyVector<int,N_rank> index;
|
---|
652 |
|
---|
653 | if (stride(firstRank) == 1)
|
---|
654 | {
|
---|
655 | T_numtype * _bz_restrict iter = data_;
|
---|
656 | int last = ubound(firstRank);
|
---|
657 |
|
---|
658 | for (index[0] = lbound(firstRank); index[0] <= last;
|
---|
659 | ++index[0])
|
---|
660 | {
|
---|
661 | iter[index[0]] = expr(index);
|
---|
662 | }
|
---|
663 | }
|
---|
664 | else {
|
---|
665 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
666 | iter.loadStride(0);
|
---|
667 | int last = ubound(firstRank);
|
---|
668 |
|
---|
669 | for (index[0] = lbound(firstRank); index[0] <= last;
|
---|
670 | ++index[0])
|
---|
671 | {
|
---|
672 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
673 | expr(index));
|
---|
674 | iter.advance();
|
---|
675 | }
|
---|
676 | }
|
---|
677 |
|
---|
678 | return *this;
|
---|
679 | }
|
---|
680 |
|
---|
681 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
682 | inline Array<T_numtype, N_rank>&
|
---|
683 | Array<T_numtype, N_rank>::evaluateWithIndexTraversalN(_bz_ArrayExpr<T_expr>
|
---|
684 | expr, T_update)
|
---|
685 | {
|
---|
686 | // Do a stack-type traversal for the destination array and use
|
---|
687 | // index traversal for the source expression
|
---|
688 |
|
---|
689 | const int maxRank = ordering(0);
|
---|
690 | const int secondLastRank = ordering(1);
|
---|
691 |
|
---|
692 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
693 | cout << "Index traversal: N_rank = " << N_rank << endl;
|
---|
694 | cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
|
---|
695 | << endl;
|
---|
696 | cout.flush();
|
---|
697 | #endif
|
---|
698 |
|
---|
699 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
700 | for (int i=1; i < N_rank; ++i)
|
---|
701 | iter.push(ordering(i));
|
---|
702 |
|
---|
703 | iter.loadStride(maxRank);
|
---|
704 |
|
---|
705 | TinyVector<int,N_rank> index, last;
|
---|
706 |
|
---|
707 | index = storage_.base();
|
---|
708 | last = storage_.base() + length_;
|
---|
709 |
|
---|
710 | int lastLength = length(maxRank);
|
---|
711 |
|
---|
712 | while (true) {
|
---|
713 |
|
---|
714 | for (index[maxRank] = base(maxRank);
|
---|
715 | index[maxRank] < last[maxRank];
|
---|
716 | ++index[maxRank])
|
---|
717 | {
|
---|
718 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
719 | #if 0
|
---|
720 | cout << "(" << index[0] << "," << index[1] << ") " << endl;
|
---|
721 | cout.flush();
|
---|
722 | #endif
|
---|
723 | #endif
|
---|
724 |
|
---|
725 | T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
|
---|
726 | iter.advance();
|
---|
727 | }
|
---|
728 |
|
---|
729 | int j = 1;
|
---|
730 | for (; j < N_rank; ++j)
|
---|
731 | {
|
---|
732 | iter.pop(ordering(j));
|
---|
733 | iter.loadStride(ordering(j));
|
---|
734 | iter.advance();
|
---|
735 |
|
---|
736 | index[ordering(j-1)] = base(ordering(j-1));
|
---|
737 | ++index[ordering(j)];
|
---|
738 | if (index[ordering(j)] != last[ordering(j)])
|
---|
739 | break;
|
---|
740 | }
|
---|
741 |
|
---|
742 | if (j == N_rank)
|
---|
743 | break;
|
---|
744 |
|
---|
745 | for (; j > 0; --j)
|
---|
746 | {
|
---|
747 | iter.push(ordering(j));
|
---|
748 | }
|
---|
749 | iter.loadStride(maxRank);
|
---|
750 | }
|
---|
751 |
|
---|
752 | return *this;
|
---|
753 | }
|
---|
754 |
|
---|
755 | // Fast traversals require <set> from the ISO/ANSI C++ standard library
|
---|
756 |
|
---|
757 | #ifdef BZ_HAVE_STD
|
---|
758 |
|
---|
759 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
760 | inline Array<T_numtype, N_rank>&
|
---|
761 | Array<T_numtype, N_rank>::evaluateWithFastTraversal(
|
---|
762 | const TraversalOrder<N_rank - 1>& order, _bz_ArrayExpr<T_expr> expr,
|
---|
763 | T_update)
|
---|
764 | {
|
---|
765 | const int maxRank = ordering(0);
|
---|
766 | const int secondLastRank = ordering(1);
|
---|
767 |
|
---|
768 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
769 | cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
|
---|
770 | << endl;
|
---|
771 | #endif
|
---|
772 |
|
---|
773 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
774 | iter.push(0);
|
---|
775 | expr.push(0);
|
---|
776 |
|
---|
777 | _bz_bool useUnitStride = iter.isUnitStride(maxRank)
|
---|
778 | && expr.isUnitStride(maxRank);
|
---|
779 |
|
---|
780 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
781 | int commonStride = expr.suggestStride(maxRank);
|
---|
782 | if (iter.suggestStride(maxRank) > commonStride)
|
---|
783 | commonStride = iter.suggestStride(maxRank);
|
---|
784 | bool useCommonStride = iter.isStride(maxRank,commonStride)
|
---|
785 | && expr.isStride(maxRank,commonStride);
|
---|
786 | #else
|
---|
787 | int commonStride = 1;
|
---|
788 | bool useCommonStride = _bz_false;
|
---|
789 | #endif
|
---|
790 |
|
---|
791 | int lastLength = length(maxRank);
|
---|
792 |
|
---|
793 | for (int i=0; i < order.length(); ++i)
|
---|
794 | {
|
---|
795 | iter.pop(0);
|
---|
796 | expr.pop(0);
|
---|
797 |
|
---|
798 | #ifdef BZ_DEBUG_TRAVERSE
|
---|
799 | cerr << "Traversing: " << order[i] << endl;
|
---|
800 | #endif
|
---|
801 | // Position the iterator at the start of the next column
|
---|
802 | for (int j=1; j < N_rank; ++j)
|
---|
803 | {
|
---|
804 | iter.loadStride(ordering(j));
|
---|
805 | expr.loadStride(ordering(j));
|
---|
806 |
|
---|
807 | int offset = order[i][j-1];
|
---|
808 | iter.advance(offset);
|
---|
809 | expr.advance(offset);
|
---|
810 | }
|
---|
811 |
|
---|
812 | iter.loadStride(maxRank);
|
---|
813 | expr.loadStride(maxRank);
|
---|
814 |
|
---|
815 | // Evaluate the expression along the column
|
---|
816 |
|
---|
817 | if ((useUnitStride) || (useCommonStride))
|
---|
818 | {
|
---|
819 | T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
|
---|
820 | + lastLength * commonStride;
|
---|
821 |
|
---|
822 | #ifdef BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
823 | int ubound = lastLength * commonStride;
|
---|
824 | T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
|
---|
825 |
|
---|
826 | if (commonStride == 1)
|
---|
827 | {
|
---|
828 | #ifndef BZ_ARRAY_FAST_TRAVERSAL_UNROLL
|
---|
829 | for (int i=0; i < ubound; ++i)
|
---|
830 | T_update::update(data[i], expr.fastRead(i));
|
---|
831 | #else
|
---|
832 | int n1 = ubound & 3;
|
---|
833 | int i=0;
|
---|
834 | for (; i < n1; ++i)
|
---|
835 | T_update::update(data[i], expr.fastRead(i));
|
---|
836 |
|
---|
837 | for (; i < ubound; i += 4)
|
---|
838 | {
|
---|
839 | T_update::update(data[i], expr.fastRead(i));
|
---|
840 | T_update::update(data[i+1], expr.fastRead(i+1));
|
---|
841 | T_update::update(data[i+2], expr.fastRead(i+2));
|
---|
842 | T_update::update(data[i+3], expr.fastRead(i+3));
|
---|
843 | }
|
---|
844 | #endif // BZ_ARRAY_FAST_TRAVERSAL_UNROLL
|
---|
845 | }
|
---|
846 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
847 | else {
|
---|
848 | for (int i=0; i < ubound; i += commonStride)
|
---|
849 | T_update::update(data[i], expr.fastRead(i));
|
---|
850 | }
|
---|
851 | #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
852 |
|
---|
853 | iter.advance(lastLength * commonStride);
|
---|
854 | expr.advance(lastLength * commonStride);
|
---|
855 | #else // ! BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
856 | while (iter.data() != last)
|
---|
857 | {
|
---|
858 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
859 | iter.advance(commonStride);
|
---|
860 | expr.advance(commonStride);
|
---|
861 | }
|
---|
862 | #endif // BZ_USE_FAST_READ_ARRAY_EXPR
|
---|
863 |
|
---|
864 | }
|
---|
865 | else {
|
---|
866 | // No common stride
|
---|
867 |
|
---|
868 | T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
|
---|
869 | + lastLength * stride(maxRank);
|
---|
870 |
|
---|
871 | while (iter.data() != last)
|
---|
872 | {
|
---|
873 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
|
---|
874 | iter.advance();
|
---|
875 | expr.advance();
|
---|
876 | }
|
---|
877 | }
|
---|
878 | }
|
---|
879 |
|
---|
880 | return *this;
|
---|
881 | }
|
---|
882 | #endif // BZ_HAVE_STD
|
---|
883 |
|
---|
884 | #ifdef BZ_ARRAY_2D_NEW_STENCIL_TILING
|
---|
885 |
|
---|
886 | #ifdef BZ_ARRAY_2D_STENCIL_TILING
|
---|
887 |
|
---|
888 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
889 | inline Array<T_numtype, N_rank>&
|
---|
890 | Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
|
---|
891 | expr, T_update)
|
---|
892 | {
|
---|
893 | const int minorRank = ordering(0);
|
---|
894 | const int majorRank = ordering(1);
|
---|
895 |
|
---|
896 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
897 | iter.push(0);
|
---|
898 | expr.push(0);
|
---|
899 |
|
---|
900 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
901 | int count = 0;
|
---|
902 | #endif
|
---|
903 |
|
---|
904 | _bz_bool useUnitStride = iter.isUnitStride(minorRank)
|
---|
905 | && expr.isUnitStride(minorRank);
|
---|
906 |
|
---|
907 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
908 | int commonStride = expr.suggestStride(minorRank);
|
---|
909 | if (iter.suggestStride(minorRank) > commonStride)
|
---|
910 | commonStride = iter.suggestStride(minorRank);
|
---|
911 | bool useCommonStride = iter.isStride(minorRank,commonStride)
|
---|
912 | && expr.isStride(minorRank,commonStride);
|
---|
913 | #else
|
---|
914 | int commonStride = 1;
|
---|
915 | bool useCommonStride = _bz_false;
|
---|
916 | #endif
|
---|
917 |
|
---|
918 | // Determine if a common major stride exists
|
---|
919 | int commonMajorStride = expr.suggestStride(majorRank);
|
---|
920 | if (iter.suggestStride(majorRank) > commonMajorStride)
|
---|
921 | commonMajorStride = iter.suggestStride(majorRank);
|
---|
922 | bool haveCommonMajorStride = iter.isStride(majorRank,commonMajorStride)
|
---|
923 | && expr.isStride(majorRank,commonMajorStride);
|
---|
924 |
|
---|
925 |
|
---|
926 | int maxi = length(majorRank);
|
---|
927 | int maxj = length(minorRank);
|
---|
928 |
|
---|
929 | const int tileHeight = 16, tileWidth = 3;
|
---|
930 |
|
---|
931 | int bi, bj;
|
---|
932 | for (bi=0; bi < maxi; bi += tileHeight)
|
---|
933 | {
|
---|
934 | int ni = bi + tileHeight;
|
---|
935 | if (ni > maxi)
|
---|
936 | ni = maxi;
|
---|
937 |
|
---|
938 | // Move back to the beginning of the array
|
---|
939 | iter.pop(0);
|
---|
940 | expr.pop(0);
|
---|
941 |
|
---|
942 | // Move to the start of this tile row
|
---|
943 | iter.loadStride(majorRank);
|
---|
944 | iter.advance(bi);
|
---|
945 | expr.loadStride(majorRank);
|
---|
946 | expr.advance(bi);
|
---|
947 |
|
---|
948 | // Save this position
|
---|
949 | iter.push(1);
|
---|
950 | expr.push(1);
|
---|
951 |
|
---|
952 | for (bj=0; bj < maxj; bj += tileWidth)
|
---|
953 | {
|
---|
954 | // Move to the beginning of the tile row
|
---|
955 | iter.pop(1);
|
---|
956 | expr.pop(1);
|
---|
957 |
|
---|
958 | // Move to the top of the current tile (bi,bj)
|
---|
959 | iter.loadStride(minorRank);
|
---|
960 | iter.advance(bj);
|
---|
961 | expr.loadStride(minorRank);
|
---|
962 | expr.advance(bj);
|
---|
963 |
|
---|
964 | if (bj + tileWidth <= maxj)
|
---|
965 | {
|
---|
966 | // Strip mining
|
---|
967 |
|
---|
968 | if ((useUnitStride) && (haveCommonMajorStride))
|
---|
969 | {
|
---|
970 | int offset = 0;
|
---|
971 | T_numtype* _bz_restrict data = const_cast<T_numtype*>
|
---|
972 | (iter.data());
|
---|
973 |
|
---|
974 | for (int i=bi; i < ni; ++i)
|
---|
975 | {
|
---|
976 | _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3;
|
---|
977 |
|
---|
978 | // Common subexpression elimination -- compilers
|
---|
979 | // won't necessarily do this on their own.
|
---|
980 | int t1 = offset+1;
|
---|
981 | int t2 = offset+2;
|
---|
982 |
|
---|
983 | tmp1 = expr.fastRead(offset);
|
---|
984 | tmp2 = expr.fastRead(t1);
|
---|
985 | tmp3 = expr.fastRead(t2);
|
---|
986 |
|
---|
987 | T_update::update(data[0], tmp1);
|
---|
988 | T_update::update(data[1], tmp2);
|
---|
989 | T_update::update(data[2], tmp3);
|
---|
990 |
|
---|
991 | offset += commonMajorStride;
|
---|
992 | data += commonMajorStride;
|
---|
993 |
|
---|
994 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
995 | count += 3;
|
---|
996 | #endif
|
---|
997 | }
|
---|
998 | }
|
---|
999 | else {
|
---|
1000 |
|
---|
1001 | for (int i=bi; i < ni; ++i)
|
---|
1002 | {
|
---|
1003 | iter.loadStride(minorRank);
|
---|
1004 | expr.loadStride(minorRank);
|
---|
1005 |
|
---|
1006 | // Loop through current row elements
|
---|
1007 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1008 | *expr);
|
---|
1009 | iter.advance();
|
---|
1010 | expr.advance();
|
---|
1011 |
|
---|
1012 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1013 | *expr);
|
---|
1014 | iter.advance();
|
---|
1015 | expr.advance();
|
---|
1016 |
|
---|
1017 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1018 | *expr);
|
---|
1019 | iter.advance(-2);
|
---|
1020 | expr.advance(-2);
|
---|
1021 |
|
---|
1022 | iter.loadStride(majorRank);
|
---|
1023 | expr.loadStride(majorRank);
|
---|
1024 | iter.advance();
|
---|
1025 | expr.advance();
|
---|
1026 |
|
---|
1027 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
1028 | count += 3;
|
---|
1029 | #endif
|
---|
1030 |
|
---|
1031 | }
|
---|
1032 | }
|
---|
1033 | }
|
---|
1034 | else {
|
---|
1035 |
|
---|
1036 | // This code handles partial tiles at the bottom of the
|
---|
1037 | // array.
|
---|
1038 |
|
---|
1039 | for (int j=bj; j < maxj; ++j)
|
---|
1040 | {
|
---|
1041 | iter.loadStride(majorRank);
|
---|
1042 | expr.loadStride(majorRank);
|
---|
1043 |
|
---|
1044 | for (int i=bi; i < ni; ++i)
|
---|
1045 | {
|
---|
1046 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1047 | *expr);
|
---|
1048 | iter.advance();
|
---|
1049 | expr.advance();
|
---|
1050 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
1051 | ++count;
|
---|
1052 | #endif
|
---|
1053 |
|
---|
1054 | }
|
---|
1055 |
|
---|
1056 | // Move back to the top of this column
|
---|
1057 | iter.advance(bi-ni);
|
---|
1058 | expr.advance(bi-ni);
|
---|
1059 |
|
---|
1060 | // Move over to the next column
|
---|
1061 | iter.loadStride(minorRank);
|
---|
1062 | expr.loadStride(minorRank);
|
---|
1063 |
|
---|
1064 | iter.advance();
|
---|
1065 | expr.advance();
|
---|
1066 | }
|
---|
1067 | }
|
---|
1068 | }
|
---|
1069 | }
|
---|
1070 |
|
---|
1071 | #ifdef BZ_2D_STENCIL_DEBUG
|
---|
1072 | cout << "BZ_2D_STENCIL_DEBUG: count = " << count << endl;
|
---|
1073 | #endif
|
---|
1074 |
|
---|
1075 | return *this;
|
---|
1076 | }
|
---|
1077 |
|
---|
1078 | #endif // BZ_ARRAY_2D_STENCIL_TILING
|
---|
1079 | #endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
|
---|
1080 |
|
---|
1081 |
|
---|
1082 |
|
---|
1083 | #ifndef BZ_ARRAY_2D_NEW_STENCIL_TILING
|
---|
1084 |
|
---|
1085 | #ifdef BZ_ARRAY_2D_STENCIL_TILING
|
---|
1086 |
|
---|
1087 | template<class T_numtype, int N_rank> template<class T_expr, class T_update>
|
---|
1088 | inline Array<T_numtype, N_rank>&
|
---|
1089 | Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
|
---|
1090 | expr, T_update)
|
---|
1091 | {
|
---|
1092 | const int minorRank = ordering(0);
|
---|
1093 | const int majorRank = ordering(1);
|
---|
1094 |
|
---|
1095 | const int blockSize = 16;
|
---|
1096 |
|
---|
1097 | ArrayIterator<T_numtype, N_rank> iter(*this);
|
---|
1098 | iter.push(0);
|
---|
1099 | expr.push(0);
|
---|
1100 |
|
---|
1101 | _bz_bool useUnitStride = iter.isUnitStride(minorRank)
|
---|
1102 | && expr.isUnitStride(minorRank);
|
---|
1103 |
|
---|
1104 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
1105 | int commonStride = expr.suggestStride(minorRank);
|
---|
1106 | if (iter.suggestStride(minorRank) > commonStride)
|
---|
1107 | commonStride = iter.suggestStride(minorRank);
|
---|
1108 | bool useCommonStride = iter.isStride(minorRank,commonStride)
|
---|
1109 | && expr.isStride(minorRank,commonStride);
|
---|
1110 | #else
|
---|
1111 | int commonStride = 1;
|
---|
1112 | bool useCommonStride = _bz_false;
|
---|
1113 | #endif
|
---|
1114 |
|
---|
1115 | int maxi = length(majorRank);
|
---|
1116 | int maxj = length(minorRank);
|
---|
1117 |
|
---|
1118 | int bi, bj;
|
---|
1119 | for (bi=0; bi < maxi; bi += blockSize)
|
---|
1120 | {
|
---|
1121 | int ni = bi + blockSize;
|
---|
1122 | if (ni > maxi)
|
---|
1123 | ni = maxi;
|
---|
1124 |
|
---|
1125 | for (bj=0; bj < maxj; bj += blockSize)
|
---|
1126 | {
|
---|
1127 | int nj = bj + blockSize;
|
---|
1128 | if (nj > maxj)
|
---|
1129 | nj = maxj;
|
---|
1130 |
|
---|
1131 | // Move to the beginning of the array
|
---|
1132 | iter.pop(0);
|
---|
1133 | expr.pop(0);
|
---|
1134 |
|
---|
1135 | // Move to the beginning of the tile (bi,bj)
|
---|
1136 | iter.loadStride(majorRank);
|
---|
1137 | iter.advance(bi);
|
---|
1138 | iter.loadStride(minorRank);
|
---|
1139 | iter.advance(bj);
|
---|
1140 |
|
---|
1141 | expr.loadStride(majorRank);
|
---|
1142 | expr.advance(bi);
|
---|
1143 | expr.loadStride(minorRank);
|
---|
1144 | expr.advance(bj);
|
---|
1145 |
|
---|
1146 | // Loop through tile rows
|
---|
1147 | for (int i=bi; i < ni; ++i)
|
---|
1148 | {
|
---|
1149 | // Save the beginning of this tile row
|
---|
1150 | iter.push(1);
|
---|
1151 | expr.push(1);
|
---|
1152 |
|
---|
1153 | // Load the minor stride
|
---|
1154 | iter.loadStride(minorRank);
|
---|
1155 | expr.loadStride(minorRank);
|
---|
1156 |
|
---|
1157 | if (useUnitStride)
|
---|
1158 | {
|
---|
1159 | T_numtype* _bz_restrict data = const_cast<T_numtype*>
|
---|
1160 | (iter.data());
|
---|
1161 |
|
---|
1162 | int ubound = (nj-bj);
|
---|
1163 | for (int j=0; j < ubound; ++j)
|
---|
1164 | T_update::update(data[j], expr.fastRead(j));
|
---|
1165 | }
|
---|
1166 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
|
---|
1167 | else if (useCommonStride)
|
---|
1168 | {
|
---|
1169 | int ubound = (nj-bj) * commonStride;
|
---|
1170 | T_numtype* _bz_restrict data = const_cast<T_numtype*>
|
---|
1171 | (iter.data());
|
---|
1172 |
|
---|
1173 | for (int j=0; j < ubound; j += commonStride)
|
---|
1174 | T_update::update(data[j], expr.fastRead(j));
|
---|
1175 | }
|
---|
1176 | #endif
|
---|
1177 | else {
|
---|
1178 | for (int j=bj; j < nj; ++j)
|
---|
1179 | {
|
---|
1180 | // Loop through current row elements
|
---|
1181 | T_update::update(*const_cast<T_numtype*>(iter.data()),
|
---|
1182 | *expr);
|
---|
1183 | iter.advance();
|
---|
1184 | expr.advance();
|
---|
1185 | }
|
---|
1186 | }
|
---|
1187 |
|
---|
1188 | // Move back to the beginning of the tile row, then
|
---|
1189 | // move to the next row
|
---|
1190 | iter.pop(1);
|
---|
1191 | iter.loadStride(majorRank);
|
---|
1192 | iter.advance(1);
|
---|
1193 |
|
---|
1194 | expr.pop(1);
|
---|
1195 | expr.loadStride(majorRank);
|
---|
1196 | expr.advance(1);
|
---|
1197 | }
|
---|
1198 | }
|
---|
1199 | }
|
---|
1200 |
|
---|
1201 | return *this;
|
---|
1202 | }
|
---|
1203 | #endif // BZ_ARRAY_2D_STENCIL_TILING
|
---|
1204 | #endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
|
---|
1205 |
|
---|
1206 | BZ_NAMESPACE_END
|
---|
1207 |
|
---|
1208 | #endif // BZ_ARRAYEVAL_CC
|
---|
1209 |
|
---|