source: Sophya/trunk/SophyaExt/Blitz/blitz/array/eval.cc@ 2576

Last change on this file since 2576 was 221, checked in by ansari, 27 years ago

Creation module DPC/Blitz (blitz 0.4) Reza 09/04/99

File size: 37.6 KB
RevLine 
[221]1/*
2 * $Id: eval.cc,v 1.1.1.1 1999-04-09 17:59:03 ansari Exp $
3 *
4 * $Log: not supported by cvs2svn $
5 * Revision 1.2 1998/03/14 00:04:47 tveldhui
6 * 0.2-alpha-05
7 *
8 * Revision 1.1 1998/02/25 20:04:01 tveldhui
9 * Initial revision
10 *
11 */
12
13#ifndef BZ_ARRAYEVAL_CC
14#define BZ_ARRAYEVAL_CC
15
16#ifndef BZ_ARRAY_H
17 #error <blitz/array/eval.cc> must be included via <blitz/array.h>
18#endif
19
20BZ_NAMESPACE(blitz)
21
22/*
23 * Assign an expression to an array. For performance reasons, there are
24 * several traversal mechanisms:
25 *
26 * - Index traversal scans through the destination array in storage order.
27 * The expression is evaluated using a TinyVector<int,N> operand. This
28 * version is used only when there are index placeholders in the expression
29 * (see <blitz/indexexpr.h>)
30 * - Stack traversal also scans through the destination array in storage
31 * order. However, push/pop stack iterators are used.
32 * - Fast traversal follows a Hilbert (or other) space-filling curve to
33 * improve cache reuse for stencilling operations. Currently, the
34 * space filling curves must be generated by calling
35 * generateFastTraversalOrder(TinyVector<int,N_dimensions>).
36 * - 2D tiled traversal follows a tiled traversal, to improve cache reuse
37 * for 2D stencils. Space filling curves have too much overhead to use
38 * in two-dimensions.
39 *
40 * _bz_tryFastTraversal is a helper class. Fast traversals are only
41 * attempted if the expression looks like a stencil -- it's at least
42 * three-dimensional, has at least six array operands, and there are
43 * no index placeholders in the expression. These are all things which
44 * can be checked at compile time, so the if()/else() syntax has been
45 * replaced with this class template.
46 */
47
48// Fast traversals require <set> from the ISO/ANSI C++ standard library
49#ifdef BZ_HAVE_STD
50
51template<_bz_bool canTryFastTraversal>
52struct _bz_tryFastTraversal {
53 template<class T_numtype, int N_rank, class T_expr, class T_update>
54 static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
55 _bz_ArrayExpr<T_expr> expr, T_update)
56 {
57 return _bz_false;
58 }
59};
60
61template<>
62struct _bz_tryFastTraversal<_bz_true> {
63 template<class T_numtype, int N_rank, class T_expr, class T_update>
64 static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
65 _bz_ArrayExpr<T_expr> expr, T_update)
66 {
67 // See if there's an appropriate space filling curve available.
68 // Currently fast traversals use an N-1 dimensional curve. The
69 // Nth dimension column corresponding to each point on the curve
70 // is traversed in the normal fashion.
71 TraversalOrderCollection<N_rank-1> traversals;
72 TinyVector<int, N_rank - 1> traversalGridSize;
73
74 for (int i=0; i < N_rank - 1; ++i)
75 traversalGridSize[i] = array.length(array.ordering(i+1));
76
77#ifdef BZ_DEBUG_TRAVERSE
78cout << "traversalGridSize = " << traversalGridSize << endl;
79cout.flush();
80#endif
81
82 const TraversalOrder<N_rank-1>* order =
83 traversals.find(traversalGridSize);
84
85 if (order)
86 {
87#ifdef BZ_DEBUG_TRAVERSE
88 cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
89 << ", " << N_rank << ">: Using stack traversal" << endl;
90#endif
91 // A curve was available -- use fast traversal.
92 array.evaluateWithFastTraversal(*order, expr, T_update());
93 return _bz_true;
94 }
95
96 return _bz_false;
97 }
98};
99
100#endif // BZ_HAVE_STD
101
102template<class T_numtype, int N_rank> template<class T_expr, class T_update>
103inline Array<T_numtype, N_rank>&
104Array<T_numtype, N_rank>::evaluate(_bz_ArrayExpr<T_expr> expr, T_update)
105{
106 // Check that all arrays have the same shape
107#ifdef BZ_DEBUG
108 if (!expr.shapeCheck(shape()))
109 {
110 if (assertFailMode == _bz_false)
111 {
112 cerr << "[Blitz++] Shape check failed: Module " << __FILE__
113 << " line " << __LINE__ << endl
114 << " Expression: ";
115 prettyPrintFormat format(_bz_true); // Use terse formatting
116 string str;
117 expr.prettyPrint(str, format);
118 cerr << str << endl ;
119 }
120
121#if 0
122// Shape dumping is broken by change to using string for prettyPrint
123 << " Shapes: " << shape() << " = ";
124 prettyPrintFormat format2;
125 format2.setDumpArrayShapesMode();
126 expr.prettyPrint(cerr, format2);
127 cerr << endl;
128#endif
129 BZ_PRE_FAIL;
130 }
131#endif
132
133 BZPRECHECK(expr.shapeCheck(shape()),
134 "Shape check failed." << endl << "Expression:");
135
136 BZPRECHECK((T_expr::rank == N_rank) || (T_expr::numArrayOperands == 0),
137 "Assigned rank " << T_expr::rank << " expression to rank "
138 << N_rank << " array.");
139
140#ifdef BZ_DEBUG_TRAVERSE
141cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders
142 << endl; cout.flush();
143#endif
144
145 // Tau profiling code. Provide Tau with a pretty-printed version of
146 // the expression.
147 // NEEDS_WORK-- use a static initializer somehow.
148
149#ifdef BZ_TAU_PROFILING
150 static string exprDescription;
151 if (!exprDescription.length()) // faked static initializer
152 {
153 exprDescription = "A";
154 prettyPrintFormat format(_bz_true); // Terse mode on
155 format.nextArrayOperandSymbol();
156 T_update::prettyPrint(exprDescription);
157 expr.prettyPrint(exprDescription, format);
158 }
159 TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
160#endif
161
162 // Determine which evaluation mechanism to use
163 if (T_expr::numIndexPlaceholders > 0)
164 {
165 // The expression involves index placeholders, so have to
166 // use index traversal rather than stack traversal.
167
168 if (N_rank == 1)
169 return evaluateWithIndexTraversal1(expr, T_update());
170 else
171 return evaluateWithIndexTraversalN(expr, T_update());
172 }
173 else {
174
175 // If this expression looks like an array stencil, then attempt to
176 // use a fast traversal order.
177 // Fast traversals require <set> from the ISO/ANSI C++ standard
178 // library.
179
180#ifdef BZ_HAVE_STD
181
182 enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
183 && (T_expr::numIndexPlaceholders == 0) };
184
185 if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update()))
186 return *this;
187
188#endif
189
190#ifdef BZ_ARRAY_2D_STENCIL_TILING
191 // Does this look like a 2-dimensional stencil on a largeish
192 // array?
193
194 if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
195 {
196 // Use a heuristic to determine whether a tiled traversal
197 // is desirable. First, estimate how much L1 cache is needed
198 // to achieve a high hit rate using the stack traversal.
199 // Try to err on the side of using tiled traversal even when
200 // it isn't strictly needed.
201
202 // Assumptions:
203 // Stencil width 3
204 // 3 arrays involved in stencil
205 // Uniform data type in arrays (all T_numtype)
206
207 int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0));
208 if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE)
209 return evaluateWithTiled2DTraversal(expr, T_update());
210 }
211
212#endif
213
214 // If fast traversal isn't available or appropriate, then just
215 // do a stack traversal.
216 if (N_rank == 1)
217 return evaluateWithStackTraversal1(expr, T_update());
218 else
219 return evaluateWithStackTraversalN(expr, T_update());
220 }
221}
222
223template<class T_numtype, int N_rank> template<class T_expr, class T_update>
224inline Array<T_numtype, N_rank>&
225Array<T_numtype, N_rank>::evaluateWithStackTraversal1(_bz_ArrayExpr<T_expr>
226 expr, T_update)
227{
228#ifdef BZ_DEBUG_TRAVERSE
229 BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
230 << ", " << N_rank << ">: Using stack traversal");
231#endif
232 ArrayIterator<T_numtype, N_rank> iter(*this);
233 iter.loadStride(firstRank);
234 expr.loadStride(firstRank);
235
236 _bz_bool useUnitStride = iter.isUnitStride(firstRank)
237 && expr.isUnitStride(firstRank);
238
239#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
240 int commonStride = expr.suggestStride(firstRank);
241 if (iter.suggestStride(firstRank) > commonStride)
242 commonStride = iter.suggestStride(firstRank);
243 bool useCommonStride = iter.isStride(firstRank,commonStride)
244 && expr.isStride(firstRank,commonStride);
245
246 #ifdef BZ_DEBUG_TRAVERSE
247 BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl
248 << " commonStride = " << commonStride << " useCommonStride = "
249 << useCommonStride);
250 #endif
251#else
252 int commonStride = 1;
253 bool useCommonStride = _bz_false;
254#endif
255
256 const T_numtype * last = iter.data() + length(firstRank)
257 * stride(firstRank);
258
259 if (useUnitStride || useCommonStride)
260 {
261#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
262
263#ifdef BZ_DEBUG_TRAVERSE
264 BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");
265#endif
266 int ubound = length(firstRank) * commonStride;
267 T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
268
269 if (commonStride == 1)
270 {
271 #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
272 for (int i=0; i < ubound; ++i)
273 T_update::update(data[i], expr.fastRead(i));
274 #else
275 int n1 = ubound & 3;
276 int i = 0;
277 for (; i < n1; ++i)
278 T_update::update(data[i], expr.fastRead(i));
279
280 for (; i < ubound; i += 4)
281 {
282#ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS
283 T_update::update(data[i], expr.fastRead(i));
284 T_update::update(data[i+1], expr.fastRead(i+1));
285 T_update::update(data[i+2], expr.fastRead(i+2));
286 T_update::update(data[i+3], expr.fastRead(i+3));
287#else
288 int t1 = i+1;
289 int t2 = i+2;
290 int t3 = i+3;
291
292 _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4;
293
294 tmp1 = expr.fastRead(i);
295 tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1));
296 tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2));
297 tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3));
298
299 T_update::update(data[i], BZ_NO_PROPAGATE(tmp1));
300 T_update::update(data[BZ_NO_PROPAGATE(t1)], tmp2);
301 T_update::update(data[BZ_NO_PROPAGATE(t2)], tmp3);
302 T_update::update(data[BZ_NO_PROPAGATE(t3)], tmp4);
303#endif
304 }
305 #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
306
307 }
308 #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
309 else {
310
311 #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
312 for (int i=0; i < ubound; i += commonStride)
313 T_update::update(data[i], expr.fastRead(i));
314 #else
315 int n1 = (length(firstRank) & 3) * commonStride;
316
317 int i = 0;
318 for (; i < n1; i += commonStride)
319 T_update::update(data[i], expr.fastRead(i));
320
321 int strideInc = 4 * commonStride;
322 for (; i < ubound; i += strideInc)
323 {
324 T_update::update(data[i], expr.fastRead(i));
325 int i2 = i + commonStride;
326 T_update::update(data[i2], expr.fastRead(i2));
327 int i3 = i + 2 * commonStride;
328 T_update::update(data[i3], expr.fastRead(i3));
329 int i4 = i + 3 * commonStride;
330 T_update::update(data[i4], expr.fastRead(i4));
331 }
332 #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
333 }
334 #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
335
336#else // ! BZ_USE_FAST_READ_ARRAY_EXPR
337
338#ifdef BZ_DEBUG_TRAVERSE
339 BZ_DEBUG_MESSAGE("Common stride, no fast read");
340#endif
341 while (iter.data() != last)
342 {
343 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
344 iter.advance(commonStride);
345 expr.advance(commonStride);
346 }
347#endif
348 }
349 else {
350 while (iter.data() != last)
351 {
352 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
353 iter.advance();
354 expr.advance();
355 }
356 }
357
358 return *this;
359}
360
361template<class T_numtype, int N_rank> template<class T_expr, class T_update>
362inline Array<T_numtype, N_rank>&
363Array<T_numtype, N_rank>::evaluateWithStackTraversalN(_bz_ArrayExpr<T_expr>
364 expr, T_update)
365{
366 /*
367 * A stack traversal replaces the usual nested loops:
368 *
369 * for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
370 * for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
371 * for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
372 * A(i,j,k) = 0;
373 *
374 * with a stack data structure. The stack allows this single
375 * routine to replace any number of nested loops.
376 *
377 * For each dimension (loop), these quantities are needed:
378 * - a pointer to the first element encountered in the loop
379 * - the stride associated with the dimension/loop
380 * - a pointer to the last element encountered in the loop
381 *
382 * The basic idea is that entering each loop is a "push" onto the
383 * stack, and exiting each loop is a "pop". In practice, this
384 * routine treats accesses the stack in a random-access way,
385 * which confuses the picture a bit. But conceptually, that's
386 * what is going on.
387 */
388
389 /*
390 * ordering(0) gives the dimension associated with the smallest
391 * stride (usually; the exceptions have to do with subarrays and
392 * are uninteresting). We call this dimension maxRank; it will
393 * become the innermost "loop".
394 *
395 * Ordering the loops from ordering(N_rank-1) down to
396 * ordering(0) ensures that the largest stride is associated
397 * with the outermost loop, and the smallest stride with the
398 * innermost. This is critical for good performance on
399 * cached machines.
400 */
401 const int maxRank = ordering(0);
402 const int secondLastRank = ordering(1);
403
404 // Create an iterator for the array receiving the result
405 ArrayIterator<T_numtype, N_rank> iter(*this);
406
407 // Set the initial stack configuration by pushing the pointer
408 // to the first element of the array onto the stack N times.
409
410 int i;
411 for (i=1; i < N_rank; ++i)
412 {
413 iter.push(i);
414 expr.push(i);
415 }
416
417 // Load the strides associated with the innermost loop.
418 iter.loadStride(maxRank);
419 expr.loadStride(maxRank);
420
421 /*
422 * Is the stride in the innermost loop equal to 1? If so,
423 * we might take advantage of this and generate more
424 * efficient code.
425 */
426 _bz_bool useUnitStride = iter.isUnitStride(maxRank)
427 && expr.isUnitStride(maxRank);
428
429 /*
430 * Do all array operands share a common stride in the innermost
431 * loop? If so, we can generate more efficient code (but only
432 * if this optimization has been enabled).
433 */
434#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
435 int commonStride = expr.suggestStride(maxRank);
436 if (iter.suggestStride(maxRank) > commonStride)
437 commonStride = iter.suggestStride(maxRank);
438 bool useCommonStride = iter.isStride(maxRank,commonStride)
439 && expr.isStride(maxRank,commonStride);
440
441#ifdef BZ_DEBUG_TRAVERSE
442 BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
443 << "commonStride = " << commonStride << " useCommonStride = "
444 << useCommonStride);
445#endif
446
447#else
448 int commonStride = 1;
449 bool useCommonStride = _bz_false;
450#endif
451
452 /*
453 * The "last" array contains a pointer to the last element
454 * encountered in each "loop".
455 */
456 const T_numtype* _bz_restrict last[N_rank];
457
458 // Set up the initial state of the "last" array
459 for (i=1; i < N_rank; ++i)
460 last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));
461
462 int lastLength = length(maxRank);
463 int firstNoncollapsedLoop = 1;
464
465#ifdef BZ_COLLAPSE_LOOPS
466
467 /*
468 * This bit of code handles collapsing loops. When possible,
469 * the N nested loops are converted into a single loop (basically,
470 * the N-dimensional array is treated as a long vector).
471 * This is important for cases where the length of the innermost
472 * loop is very small, for example a 100x100x3 array.
473 * If this code can't collapse all the loops into a single loop,
474 * it will collapse as many loops as possible starting from the
475 * innermost and working out.
476 */
477
478 // Collapse loops when possible
479 for (i=1; i < N_rank; ++i)
480 {
481 // Figure out which pair of loops we are considering combining.
482 int outerLoopRank = ordering(i);
483 int innerLoopRank = ordering(i-1);
484
485 /*
486 * The canCollapse() routines look at the strides and extents
487 * of the loops, and determine if they can be combined into
488 * one loop.
489 */
490
491 if (canCollapse(outerLoopRank,innerLoopRank)
492 && expr.canCollapse(outerLoopRank,innerLoopRank))
493 {
494 lastLength *= length(outerLoopRank);
495 firstNoncollapsedLoop = i+1;
496 }
497 }
498#endif // BZ_COLLAPSE_LOOPS
499
500 /*
501 * Now we actually perform the loops. This while loop contains
502 * two parts: first, the innermost loop is performed. Then we
503 * exit the loop, and pop our way down the stack until we find
504 * a loop that isn't completed. We then restart the inner loops
505 * and push them onto the stack.
506 */
507
508 while (true) {
509
510 /*
511 * This bit of code handles the innermost loop. It would look
512 * a lot simpler if it weren't for unit stride and common stride
513 * optimizations; these clutter up the code with multiple versions.
514 */
515
516 if ((useUnitStride) || (useCommonStride))
517 {
518 T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
519 + lastLength;
520
521#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
522
523 /*
524 * The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably
525 * be taken out. This was put in place while the unit stride/
526 * common stride optimizations were being implemented and
527 * tested.
528 */
529
530 // Calculate the end of the innermost loop
531 int ubound = lastLength * commonStride;
532
533 /*
534 * This is a real kludge. I didn't want to have to write
535 * a const and non-const version of ArrayIterator, so I use a
536 * const iterator and cast away const. This could
537 * probably be avoided with some trick, but the whole routine
538 * is ugly, so why bother.
539 */
540
541 T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
542
543 /*
544 * BZ_NEEDS_WORK-- need to implement optional unrolling.
545 */
546 if (commonStride == 1)
547 {
548 for (int i=0; i < ubound; ++i)
549 T_update::update(data[i], expr.fastRead(i));
550 }
551#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
552 else {
553 for (int i=0; i < ubound; i += commonStride)
554 T_update::update(data[i], expr.fastRead(i));
555 }
556#endif
557 /*
558 * Tidy up for the fact that we haven't actually been
559 * incrementing the iterators in the innermost loop, by
560 * faking it afterward.
561 */
562 iter.advance(lastLength * commonStride);
563 expr.advance(lastLength * commonStride);
564#else
565 // !BZ_USE_FAST_READ_ARRAY_EXPR
566 // This bit of code not really needed; should remove at some
567 // point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR
568
569 while (iter.data() != end)
570 {
571 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
572 iter.advance(commonStride);
573 expr.advance(commonStride);
574 }
575#endif
576 }
577 else {
578 /*
579 * We don't have a unit stride or common stride in the innermost
580 * loop. This is going to hurt performance. Luckily 95% of
581 * the time, we hit the cases above.
582 */
583 T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
584 + lastLength * stride(maxRank);
585
586 while (iter.data() != end)
587 {
588 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
589 iter.advance();
590 expr.advance();
591 }
592 }
593
594
595 /*
596 * We just finished the innermost loop. Now we pop our way down
597 * the stack, until we hit a loop that hasn't completed yet.
598 */
599 int j = firstNoncollapsedLoop;
600 for (; j < N_rank; ++j)
601 {
602 // Get the next loop
603 int r = ordering(j);
604
605 // Pop-- this restores the data pointers to the first element
606 // encountered in the loop.
607 iter.pop(j);
608 expr.pop(j);
609
610 // Load the stride associated with this loop, and increment
611 // once.
612 iter.loadStride(r);
613 expr.loadStride(r);
614 iter.advance();
615 expr.advance();
616
617 // If we aren't at the end of this loop, then stop popping.
618 if (iter.data() != last[j])
619 break;
620 }
621
622 // Are we completely done?
623 if (j == N_rank)
624 break;
625
626 // No, so push all the inner loops back onto the stack.
627 for (; j >= firstNoncollapsedLoop; --j)
628 {
629 int r2 = ordering(j-1);
630 iter.push(j);
631 expr.push(j);
632 last[j-1] = iter.data() + length(r2) * stride(r2);
633 }
634
635 // Load the stride for the innermost loop again.
636 iter.loadStride(maxRank);
637 expr.loadStride(maxRank);
638 }
639
640 return *this;
641}
642
643template<class T_numtype, int N_rank> template<class T_expr, class T_update>
644inline Array<T_numtype, N_rank>&
645Array<T_numtype, N_rank>::evaluateWithIndexTraversal1(_bz_ArrayExpr<T_expr>
646 expr, T_update)
647{
648 TinyVector<int,N_rank> index;
649
650 if (stride(firstRank) == 1)
651 {
652 T_numtype * _bz_restrict iter = data_;
653 int last = ubound(firstRank);
654
655 for (index[0] = lbound(firstRank); index[0] <= last;
656 ++index[0])
657 {
658 iter[index[0]] = expr(index);
659 }
660 }
661 else {
662 ArrayIterator<T_numtype, N_rank> iter(*this);
663 iter.loadStride(0);
664 int last = ubound(firstRank);
665
666 for (index[0] = lbound(firstRank); index[0] <= last;
667 ++index[0])
668 {
669 T_update::update(*const_cast<T_numtype*>(iter.data()),
670 expr(index));
671 iter.advance();
672 }
673 }
674
675 return *this;
676}
677
678template<class T_numtype, int N_rank> template<class T_expr, class T_update>
679inline Array<T_numtype, N_rank>&
680Array<T_numtype, N_rank>::evaluateWithIndexTraversalN(_bz_ArrayExpr<T_expr>
681 expr, T_update)
682{
683 // Do a stack-type traversal for the destination array and use
684 // index traversal for the source expression
685
686 const int maxRank = ordering(0);
687 const int secondLastRank = ordering(1);
688
689#ifdef BZ_DEBUG_TRAVERSE
690cout << "Index traversal: N_rank = " << N_rank << endl;
691cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
692 << endl;
693cout.flush();
694#endif
695
696 ArrayIterator<T_numtype, N_rank> iter(*this);
697 for (int i=1; i < N_rank; ++i)
698 iter.push(ordering(i));
699
700 iter.loadStride(maxRank);
701
702 TinyVector<int,N_rank> index, last;
703
704 index = storage_.base();
705 last = storage_.base() + length_;
706
707 int lastLength = length(maxRank);
708
709 while (true) {
710
711 for (index[maxRank] = base(maxRank);
712 index[maxRank] < last[maxRank];
713 ++index[maxRank])
714 {
715#ifdef BZ_DEBUG_TRAVERSE
716#if 0
717 cout << "(" << index[0] << "," << index[1] << ") " << endl;
718 cout.flush();
719#endif
720#endif
721
722 T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
723 iter.advance();
724 }
725
726 int j = 1;
727 for (; j < N_rank; ++j)
728 {
729 iter.pop(ordering(j));
730 iter.loadStride(ordering(j));
731 iter.advance();
732
733 index[ordering(j-1)] = base(ordering(j-1));
734 ++index[ordering(j)];
735 if (index[ordering(j)] != last[ordering(j)])
736 break;
737 }
738
739 if (j == N_rank)
740 break;
741
742 for (; j > 0; --j)
743 {
744 iter.push(ordering(j));
745 }
746 iter.loadStride(maxRank);
747 }
748
749 return *this;
750}
751
752// Fast traversals require <set> from the ISO/ANSI C++ standard library
753
754#ifdef BZ_HAVE_STD
755
756template<class T_numtype, int N_rank> template<class T_expr, class T_update>
757inline Array<T_numtype, N_rank>&
758Array<T_numtype, N_rank>::evaluateWithFastTraversal(
759 const TraversalOrder<N_rank - 1>& order, _bz_ArrayExpr<T_expr> expr,
760 T_update)
761{
762 const int maxRank = ordering(0);
763 const int secondLastRank = ordering(1);
764
765#ifdef BZ_DEBUG_TRAVERSE
766cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
767 << endl;
768#endif
769
770 ArrayIterator<T_numtype, N_rank> iter(*this);
771 iter.push(0);
772 expr.push(0);
773
774 _bz_bool useUnitStride = iter.isUnitStride(maxRank)
775 && expr.isUnitStride(maxRank);
776
777#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
778 int commonStride = expr.suggestStride(maxRank);
779 if (iter.suggestStride(maxRank) > commonStride)
780 commonStride = iter.suggestStride(maxRank);
781 bool useCommonStride = iter.isStride(maxRank,commonStride)
782 && expr.isStride(maxRank,commonStride);
783#else
784 int commonStride = 1;
785 bool useCommonStride = _bz_false;
786#endif
787
788 int lastLength = length(maxRank);
789
790 for (int i=0; i < order.length(); ++i)
791 {
792 iter.pop(0);
793 expr.pop(0);
794
795#ifdef BZ_DEBUG_TRAVERSE
796 cerr << "Traversing: " << order[i] << endl;
797#endif
798 // Position the iterator at the start of the next column
799 for (int j=1; j < N_rank; ++j)
800 {
801 iter.loadStride(ordering(j));
802 expr.loadStride(ordering(j));
803
804 int offset = order[i][j-1];
805 iter.advance(offset);
806 expr.advance(offset);
807 }
808
809 iter.loadStride(maxRank);
810 expr.loadStride(maxRank);
811
812 // Evaluate the expression along the column
813
814 if ((useUnitStride) || (useCommonStride))
815 {
816 T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
817 + lastLength * commonStride;
818
819#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
820 int ubound = lastLength * commonStride;
821 T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
822
823 if (commonStride == 1)
824 {
825 #ifndef BZ_ARRAY_FAST_TRAVERSAL_UNROLL
826 for (int i=0; i < ubound; ++i)
827 T_update::update(data[i], expr.fastRead(i));
828 #else
829 int n1 = ubound & 3;
830 int i=0;
831 for (; i < n1; ++i)
832 T_update::update(data[i], expr.fastRead(i));
833
834 for (; i < ubound; i += 4)
835 {
836 T_update::update(data[i], expr.fastRead(i));
837 T_update::update(data[i+1], expr.fastRead(i+1));
838 T_update::update(data[i+2], expr.fastRead(i+2));
839 T_update::update(data[i+3], expr.fastRead(i+3));
840 }
841 #endif // BZ_ARRAY_FAST_TRAVERSAL_UNROLL
842 }
843 #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
844 else {
845 for (int i=0; i < ubound; i += commonStride)
846 T_update::update(data[i], expr.fastRead(i));
847 }
848 #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
849
850 iter.advance(lastLength * commonStride);
851 expr.advance(lastLength * commonStride);
852#else // ! BZ_USE_FAST_READ_ARRAY_EXPR
853 while (iter.data() != last)
854 {
855 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
856 iter.advance(commonStride);
857 expr.advance(commonStride);
858 }
859#endif // BZ_USE_FAST_READ_ARRAY_EXPR
860
861 }
862 else {
863 // No common stride
864
865 T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
866 + lastLength * stride(maxRank);
867
868 while (iter.data() != last)
869 {
870 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
871 iter.advance();
872 expr.advance();
873 }
874 }
875 }
876
877 return *this;
878}
879#endif // BZ_HAVE_STD
880
881#ifdef BZ_ARRAY_2D_NEW_STENCIL_TILING
882
883#ifdef BZ_ARRAY_2D_STENCIL_TILING
884
885template<class T_numtype, int N_rank> template<class T_expr, class T_update>
886inline Array<T_numtype, N_rank>&
887Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
888 expr, T_update)
889{
890 const int minorRank = ordering(0);
891 const int majorRank = ordering(1);
892
893 ArrayIterator<T_numtype, N_rank> iter(*this);
894 iter.push(0);
895 expr.push(0);
896
897#ifdef BZ_2D_STENCIL_DEBUG
898 int count = 0;
899#endif
900
901 _bz_bool useUnitStride = iter.isUnitStride(minorRank)
902 && expr.isUnitStride(minorRank);
903
904#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
905 int commonStride = expr.suggestStride(minorRank);
906 if (iter.suggestStride(minorRank) > commonStride)
907 commonStride = iter.suggestStride(minorRank);
908 bool useCommonStride = iter.isStride(minorRank,commonStride)
909 && expr.isStride(minorRank,commonStride);
910#else
911 int commonStride = 1;
912 bool useCommonStride = _bz_false;
913#endif
914
915 // Determine if a common major stride exists
916 int commonMajorStride = expr.suggestStride(majorRank);
917 if (iter.suggestStride(majorRank) > commonMajorStride)
918 commonMajorStride = iter.suggestStride(majorRank);
919 bool haveCommonMajorStride = iter.isStride(majorRank,commonMajorStride)
920 && expr.isStride(majorRank,commonMajorStride);
921
922
923 int maxi = length(majorRank);
924 int maxj = length(minorRank);
925
926 const int tileHeight = 16, tileWidth = 3;
927
928 int bi, bj;
929 for (bi=0; bi < maxi; bi += tileHeight)
930 {
931 int ni = bi + tileHeight;
932 if (ni > maxi)
933 ni = maxi;
934
935 // Move back to the beginning of the array
936 iter.pop(0);
937 expr.pop(0);
938
939 // Move to the start of this tile row
940 iter.loadStride(majorRank);
941 iter.advance(bi);
942 expr.loadStride(majorRank);
943 expr.advance(bi);
944
945 // Save this position
946 iter.push(1);
947 expr.push(1);
948
949 for (bj=0; bj < maxj; bj += tileWidth)
950 {
951 // Move to the beginning of the tile row
952 iter.pop(1);
953 expr.pop(1);
954
955 // Move to the top of the current tile (bi,bj)
956 iter.loadStride(minorRank);
957 iter.advance(bj);
958 expr.loadStride(minorRank);
959 expr.advance(bj);
960
961 if (bj + tileWidth <= maxj)
962 {
963 // Strip mining
964
965 if ((useUnitStride) && (haveCommonMajorStride))
966 {
967 int offset = 0;
968 T_numtype* _bz_restrict data = const_cast<T_numtype*>
969 (iter.data());
970
971 for (int i=bi; i < ni; ++i)
972 {
973 _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3;
974
975 // Common subexpression elimination -- compilers
976 // won't necessarily do this on their own.
977 int t1 = offset+1;
978 int t2 = offset+2;
979
980 tmp1 = expr.fastRead(offset);
981 tmp2 = expr.fastRead(t1);
982 tmp3 = expr.fastRead(t2);
983
984 T_update::update(data[0], tmp1);
985 T_update::update(data[1], tmp2);
986 T_update::update(data[2], tmp3);
987
988 offset += commonMajorStride;
989 data += commonMajorStride;
990
991#ifdef BZ_2D_STENCIL_DEBUG
992 count += 3;
993#endif
994 }
995 }
996 else {
997
998 for (int i=bi; i < ni; ++i)
999 {
1000 iter.loadStride(minorRank);
1001 expr.loadStride(minorRank);
1002
1003 // Loop through current row elements
1004 T_update::update(*const_cast<T_numtype*>(iter.data()),
1005 *expr);
1006 iter.advance();
1007 expr.advance();
1008
1009 T_update::update(*const_cast<T_numtype*>(iter.data()),
1010 *expr);
1011 iter.advance();
1012 expr.advance();
1013
1014 T_update::update(*const_cast<T_numtype*>(iter.data()),
1015 *expr);
1016 iter.advance(-2);
1017 expr.advance(-2);
1018
1019 iter.loadStride(majorRank);
1020 expr.loadStride(majorRank);
1021 iter.advance();
1022 expr.advance();
1023
1024#ifdef BZ_2D_STENCIL_DEBUG
1025 count += 3;
1026#endif
1027
1028 }
1029 }
1030 }
1031 else {
1032
1033 // This code handles partial tiles at the bottom of the
1034 // array.
1035
1036 for (int j=bj; j < maxj; ++j)
1037 {
1038 iter.loadStride(majorRank);
1039 expr.loadStride(majorRank);
1040
1041 for (int i=bi; i < ni; ++i)
1042 {
1043 T_update::update(*const_cast<T_numtype*>(iter.data()),
1044 *expr);
1045 iter.advance();
1046 expr.advance();
1047#ifdef BZ_2D_STENCIL_DEBUG
1048 ++count;
1049#endif
1050
1051 }
1052
1053 // Move back to the top of this column
1054 iter.advance(bi-ni);
1055 expr.advance(bi-ni);
1056
1057 // Move over to the next column
1058 iter.loadStride(minorRank);
1059 expr.loadStride(minorRank);
1060
1061 iter.advance();
1062 expr.advance();
1063 }
1064 }
1065 }
1066 }
1067
1068#ifdef BZ_2D_STENCIL_DEBUG
1069 cout << "BZ_2D_STENCIL_DEBUG: count = " << count << endl;
1070#endif
1071
1072 return *this;
1073}
1074
1075#endif // BZ_ARRAY_2D_STENCIL_TILING
1076#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1077
1078
1079
1080#ifndef BZ_ARRAY_2D_NEW_STENCIL_TILING
1081
1082#ifdef BZ_ARRAY_2D_STENCIL_TILING
1083
1084template<class T_numtype, int N_rank> template<class T_expr, class T_update>
1085inline Array<T_numtype, N_rank>&
1086Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
1087 expr, T_update)
1088{
1089 const int minorRank = ordering(0);
1090 const int majorRank = ordering(1);
1091
1092 const int blockSize = 16;
1093
1094 ArrayIterator<T_numtype, N_rank> iter(*this);
1095 iter.push(0);
1096 expr.push(0);
1097
1098 _bz_bool useUnitStride = iter.isUnitStride(minorRank)
1099 && expr.isUnitStride(minorRank);
1100
1101#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1102 int commonStride = expr.suggestStride(minorRank);
1103 if (iter.suggestStride(minorRank) > commonStride)
1104 commonStride = iter.suggestStride(minorRank);
1105 bool useCommonStride = iter.isStride(minorRank,commonStride)
1106 && expr.isStride(minorRank,commonStride);
1107#else
1108 int commonStride = 1;
1109 bool useCommonStride = _bz_false;
1110#endif
1111
1112 int maxi = length(majorRank);
1113 int maxj = length(minorRank);
1114
1115 int bi, bj;
1116 for (bi=0; bi < maxi; bi += blockSize)
1117 {
1118 int ni = bi + blockSize;
1119 if (ni > maxi)
1120 ni = maxi;
1121
1122 for (bj=0; bj < maxj; bj += blockSize)
1123 {
1124 int nj = bj + blockSize;
1125 if (nj > maxj)
1126 nj = maxj;
1127
1128 // Move to the beginning of the array
1129 iter.pop(0);
1130 expr.pop(0);
1131
1132 // Move to the beginning of the tile (bi,bj)
1133 iter.loadStride(majorRank);
1134 iter.advance(bi);
1135 iter.loadStride(minorRank);
1136 iter.advance(bj);
1137
1138 expr.loadStride(majorRank);
1139 expr.advance(bi);
1140 expr.loadStride(minorRank);
1141 expr.advance(bj);
1142
1143 // Loop through tile rows
1144 for (int i=bi; i < ni; ++i)
1145 {
1146 // Save the beginning of this tile row
1147 iter.push(1);
1148 expr.push(1);
1149
1150 // Load the minor stride
1151 iter.loadStride(minorRank);
1152 expr.loadStride(minorRank);
1153
1154 if (useUnitStride)
1155 {
1156 T_numtype* _bz_restrict data = const_cast<T_numtype*>
1157 (iter.data());
1158
1159 int ubound = (nj-bj);
1160 for (int j=0; j < ubound; ++j)
1161 T_update::update(data[j], expr.fastRead(j));
1162 }
1163#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1164 else if (useCommonStride)
1165 {
1166 int ubound = (nj-bj) * commonStride;
1167 T_numtype* _bz_restrict data = const_cast<T_numtype*>
1168 (iter.data());
1169
1170 for (int j=0; j < ubound; j += commonStride)
1171 T_update::update(data[j], expr.fastRead(j));
1172 }
1173#endif
1174 else {
1175 for (int j=bj; j < nj; ++j)
1176 {
1177 // Loop through current row elements
1178 T_update::update(*const_cast<T_numtype*>(iter.data()),
1179 *expr);
1180 iter.advance();
1181 expr.advance();
1182 }
1183 }
1184
1185 // Move back to the beginning of the tile row, then
1186 // move to the next row
1187 iter.pop(1);
1188 iter.loadStride(majorRank);
1189 iter.advance(1);
1190
1191 expr.pop(1);
1192 expr.loadStride(majorRank);
1193 expr.advance(1);
1194 }
1195 }
1196 }
1197
1198 return *this;
1199}
1200#endif // BZ_ARRAY_2D_STENCIL_TILING
1201#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1202
1203BZ_NAMESPACE_END
1204
1205#endif // BZ_ARRAYEVAL_CC
1206
Note: See TracBrowser for help on using the repository browser.