source: Sophya/trunk/Poubelle/DPC:FitsIOServer/Blitz/blitz/array/eval.cc@ 3945

Last change on this file since 3945 was 658, checked in by ansari, 26 years ago

no message

File size: 37.7 KB
RevLine 
[658]1/*
2 * $Id: eval.cc,v 1.1.1.1 1999-11-26 16:37:06 ansari Exp $
3 *
4 * $Log: not supported by cvs2svn $
5// Revision 1.1.1.1 1999/04/09 17:59:03 ansari
6// Creation module DPC/Blitz (blitz 0.4) Reza 09/04/99
7//
8 * Revision 1.2 1998/03/14 00:04:47 tveldhui
9 * 0.2-alpha-05
10 *
11 * Revision 1.1 1998/02/25 20:04:01 tveldhui
12 * Initial revision
13 *
14 */
15
16#ifndef BZ_ARRAYEVAL_CC
17#define BZ_ARRAYEVAL_CC
18
19#ifndef BZ_ARRAY_H
20 #error <blitz/array/eval.cc> must be included via <blitz/array.h>
21#endif
22
23BZ_NAMESPACE(blitz)
24
25/*
26 * Assign an expression to an array. For performance reasons, there are
27 * several traversal mechanisms:
28 *
29 * - Index traversal scans through the destination array in storage order.
30 * The expression is evaluated using a TinyVector<int,N> operand. This
31 * version is used only when there are index placeholders in the expression
32 * (see <blitz/indexexpr.h>)
33 * - Stack traversal also scans through the destination array in storage
34 * order. However, push/pop stack iterators are used.
35 * - Fast traversal follows a Hilbert (or other) space-filling curve to
36 * improve cache reuse for stencilling operations. Currently, the
37 * space filling curves must be generated by calling
38 * generateFastTraversalOrder(TinyVector<int,N_dimensions>).
39 * - 2D tiled traversal follows a tiled traversal, to improve cache reuse
40 * for 2D stencils. Space filling curves have too much overhead to use
41 * in two-dimensions.
42 *
43 * _bz_tryFastTraversal is a helper class. Fast traversals are only
44 * attempted if the expression looks like a stencil -- it's at least
45 * three-dimensional, has at least six array operands, and there are
46 * no index placeholders in the expression. These are all things which
47 * can be checked at compile time, so the if()/else() syntax has been
48 * replaced with this class template.
49 */
50
51// Fast traversals require <set> from the ISO/ANSI C++ standard library
52#ifdef BZ_HAVE_STD
53
54template<_bz_bool canTryFastTraversal>
55struct _bz_tryFastTraversal {
56 template<class T_numtype, int N_rank, class T_expr, class T_update>
57 static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
58 _bz_ArrayExpr<T_expr> expr, T_update)
59 {
60 return _bz_false;
61 }
62};
63
64template<>
65struct _bz_tryFastTraversal<_bz_true> {
66 template<class T_numtype, int N_rank, class T_expr, class T_update>
67 static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
68 _bz_ArrayExpr<T_expr> expr, T_update)
69 {
70 // See if there's an appropriate space filling curve available.
71 // Currently fast traversals use an N-1 dimensional curve. The
72 // Nth dimension column corresponding to each point on the curve
73 // is traversed in the normal fashion.
74 TraversalOrderCollection<N_rank-1> traversals;
75 TinyVector<int, N_rank - 1> traversalGridSize;
76
77 for (int i=0; i < N_rank - 1; ++i)
78 traversalGridSize[i] = array.length(array.ordering(i+1));
79
80#ifdef BZ_DEBUG_TRAVERSE
81cout << "traversalGridSize = " << traversalGridSize << endl;
82cout.flush();
83#endif
84
85 const TraversalOrder<N_rank-1>* order =
86 traversals.find(traversalGridSize);
87
88 if (order)
89 {
90#ifdef BZ_DEBUG_TRAVERSE
91 cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
92 << ", " << N_rank << ">: Using stack traversal" << endl;
93#endif
94 // A curve was available -- use fast traversal.
95 array.evaluateWithFastTraversal(*order, expr, T_update());
96 return _bz_true;
97 }
98
99 return _bz_false;
100 }
101};
102
103#endif // BZ_HAVE_STD
104
105template<class T_numtype, int N_rank> template<class T_expr, class T_update>
106inline Array<T_numtype, N_rank>&
107Array<T_numtype, N_rank>::evaluate(_bz_ArrayExpr<T_expr> expr, T_update)
108{
109 // Check that all arrays have the same shape
110#ifdef BZ_DEBUG
111 if (!expr.shapeCheck(shape()))
112 {
113 if (assertFailMode == _bz_false)
114 {
115 cerr << "[Blitz++] Shape check failed: Module " << __FILE__
116 << " line " << __LINE__ << endl
117 << " Expression: ";
118 prettyPrintFormat format(_bz_true); // Use terse formatting
119 string str;
120 expr.prettyPrint(str, format);
121 cerr << str << endl ;
122 }
123
124#if 0
125// Shape dumping is broken by change to using string for prettyPrint
126 << " Shapes: " << shape() << " = ";
127 prettyPrintFormat format2;
128 format2.setDumpArrayShapesMode();
129 expr.prettyPrint(cerr, format2);
130 cerr << endl;
131#endif
132 BZ_PRE_FAIL;
133 }
134#endif
135
136 BZPRECHECK(expr.shapeCheck(shape()),
137 "Shape check failed." << endl << "Expression:");
138
139 BZPRECHECK((T_expr::rank == N_rank) || (T_expr::numArrayOperands == 0),
140 "Assigned rank " << T_expr::rank << " expression to rank "
141 << N_rank << " array.");
142
143#ifdef BZ_DEBUG_TRAVERSE
144cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders
145 << endl; cout.flush();
146#endif
147
148 // Tau profiling code. Provide Tau with a pretty-printed version of
149 // the expression.
150 // NEEDS_WORK-- use a static initializer somehow.
151
152#ifdef BZ_TAU_PROFILING
153 static string exprDescription;
154 if (!exprDescription.length()) // faked static initializer
155 {
156 exprDescription = "A";
157 prettyPrintFormat format(_bz_true); // Terse mode on
158 format.nextArrayOperandSymbol();
159 T_update::prettyPrint(exprDescription);
160 expr.prettyPrint(exprDescription, format);
161 }
162 TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
163#endif
164
165 // Determine which evaluation mechanism to use
166 if (T_expr::numIndexPlaceholders > 0)
167 {
168 // The expression involves index placeholders, so have to
169 // use index traversal rather than stack traversal.
170
171 if (N_rank == 1)
172 return evaluateWithIndexTraversal1(expr, T_update());
173 else
174 return evaluateWithIndexTraversalN(expr, T_update());
175 }
176 else {
177
178 // If this expression looks like an array stencil, then attempt to
179 // use a fast traversal order.
180 // Fast traversals require <set> from the ISO/ANSI C++ standard
181 // library.
182
183#ifdef BZ_HAVE_STD
184
185 enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
186 && (T_expr::numIndexPlaceholders == 0) };
187
188 if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update()))
189 return *this;
190
191#endif
192
193#ifdef BZ_ARRAY_2D_STENCIL_TILING
194 // Does this look like a 2-dimensional stencil on a largeish
195 // array?
196
197 if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
198 {
199 // Use a heuristic to determine whether a tiled traversal
200 // is desirable. First, estimate how much L1 cache is needed
201 // to achieve a high hit rate using the stack traversal.
202 // Try to err on the side of using tiled traversal even when
203 // it isn't strictly needed.
204
205 // Assumptions:
206 // Stencil width 3
207 // 3 arrays involved in stencil
208 // Uniform data type in arrays (all T_numtype)
209
210 int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0));
211 if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE)
212 return evaluateWithTiled2DTraversal(expr, T_update());
213 }
214
215#endif
216
217 // If fast traversal isn't available or appropriate, then just
218 // do a stack traversal.
219 if (N_rank == 1)
220 return evaluateWithStackTraversal1(expr, T_update());
221 else
222 return evaluateWithStackTraversalN(expr, T_update());
223 }
224}
225
226template<class T_numtype, int N_rank> template<class T_expr, class T_update>
227inline Array<T_numtype, N_rank>&
228Array<T_numtype, N_rank>::evaluateWithStackTraversal1(_bz_ArrayExpr<T_expr>
229 expr, T_update)
230{
231#ifdef BZ_DEBUG_TRAVERSE
232 BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
233 << ", " << N_rank << ">: Using stack traversal");
234#endif
235 ArrayIterator<T_numtype, N_rank> iter(*this);
236 iter.loadStride(firstRank);
237 expr.loadStride(firstRank);
238
239 _bz_bool useUnitStride = iter.isUnitStride(firstRank)
240 && expr.isUnitStride(firstRank);
241
242#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
243 int commonStride = expr.suggestStride(firstRank);
244 if (iter.suggestStride(firstRank) > commonStride)
245 commonStride = iter.suggestStride(firstRank);
246 bool useCommonStride = iter.isStride(firstRank,commonStride)
247 && expr.isStride(firstRank,commonStride);
248
249 #ifdef BZ_DEBUG_TRAVERSE
250 BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl
251 << " commonStride = " << commonStride << " useCommonStride = "
252 << useCommonStride);
253 #endif
254#else
255 int commonStride = 1;
256 bool useCommonStride = _bz_false;
257#endif
258
259 const T_numtype * last = iter.data() + length(firstRank)
260 * stride(firstRank);
261
262 if (useUnitStride || useCommonStride)
263 {
264#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
265
266#ifdef BZ_DEBUG_TRAVERSE
267 BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");
268#endif
269 int ubound = length(firstRank) * commonStride;
270 T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
271
272 if (commonStride == 1)
273 {
274 #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
275 for (int i=0; i < ubound; ++i)
276 T_update::update(data[i], expr.fastRead(i));
277 #else
278 int n1 = ubound & 3;
279 int i = 0;
280 for (; i < n1; ++i)
281 T_update::update(data[i], expr.fastRead(i));
282
283 for (; i < ubound; i += 4)
284 {
285#ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS
286 T_update::update(data[i], expr.fastRead(i));
287 T_update::update(data[i+1], expr.fastRead(i+1));
288 T_update::update(data[i+2], expr.fastRead(i+2));
289 T_update::update(data[i+3], expr.fastRead(i+3));
290#else
291 int t1 = i+1;
292 int t2 = i+2;
293 int t3 = i+3;
294
295 _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4;
296
297 tmp1 = expr.fastRead(i);
298 tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1));
299 tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2));
300 tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3));
301
302 T_update::update(data[i], BZ_NO_PROPAGATE(tmp1));
303 T_update::update(data[BZ_NO_PROPAGATE(t1)], tmp2);
304 T_update::update(data[BZ_NO_PROPAGATE(t2)], tmp3);
305 T_update::update(data[BZ_NO_PROPAGATE(t3)], tmp4);
306#endif
307 }
308 #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
309
310 }
311 #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
312 else {
313
314 #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
315 for (int i=0; i < ubound; i += commonStride)
316 T_update::update(data[i], expr.fastRead(i));
317 #else
318 int n1 = (length(firstRank) & 3) * commonStride;
319
320 int i = 0;
321 for (; i < n1; i += commonStride)
322 T_update::update(data[i], expr.fastRead(i));
323
324 int strideInc = 4 * commonStride;
325 for (; i < ubound; i += strideInc)
326 {
327 T_update::update(data[i], expr.fastRead(i));
328 int i2 = i + commonStride;
329 T_update::update(data[i2], expr.fastRead(i2));
330 int i3 = i + 2 * commonStride;
331 T_update::update(data[i3], expr.fastRead(i3));
332 int i4 = i + 3 * commonStride;
333 T_update::update(data[i4], expr.fastRead(i4));
334 }
335 #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
336 }
337 #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
338
339#else // ! BZ_USE_FAST_READ_ARRAY_EXPR
340
341#ifdef BZ_DEBUG_TRAVERSE
342 BZ_DEBUG_MESSAGE("Common stride, no fast read");
343#endif
344 while (iter.data() != last)
345 {
346 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
347 iter.advance(commonStride);
348 expr.advance(commonStride);
349 }
350#endif
351 }
352 else {
353 while (iter.data() != last)
354 {
355 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
356 iter.advance();
357 expr.advance();
358 }
359 }
360
361 return *this;
362}
363
364template<class T_numtype, int N_rank> template<class T_expr, class T_update>
365inline Array<T_numtype, N_rank>&
366Array<T_numtype, N_rank>::evaluateWithStackTraversalN(_bz_ArrayExpr<T_expr>
367 expr, T_update)
368{
369 /*
370 * A stack traversal replaces the usual nested loops:
371 *
372 * for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
373 * for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
374 * for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
375 * A(i,j,k) = 0;
376 *
377 * with a stack data structure. The stack allows this single
378 * routine to replace any number of nested loops.
379 *
380 * For each dimension (loop), these quantities are needed:
381 * - a pointer to the first element encountered in the loop
382 * - the stride associated with the dimension/loop
383 * - a pointer to the last element encountered in the loop
384 *
385 * The basic idea is that entering each loop is a "push" onto the
386 * stack, and exiting each loop is a "pop". In practice, this
387 * routine treats accesses the stack in a random-access way,
388 * which confuses the picture a bit. But conceptually, that's
389 * what is going on.
390 */
391
392 /*
393 * ordering(0) gives the dimension associated with the smallest
394 * stride (usually; the exceptions have to do with subarrays and
395 * are uninteresting). We call this dimension maxRank; it will
396 * become the innermost "loop".
397 *
398 * Ordering the loops from ordering(N_rank-1) down to
399 * ordering(0) ensures that the largest stride is associated
400 * with the outermost loop, and the smallest stride with the
401 * innermost. This is critical for good performance on
402 * cached machines.
403 */
404 const int maxRank = ordering(0);
405 const int secondLastRank = ordering(1);
406
407 // Create an iterator for the array receiving the result
408 ArrayIterator<T_numtype, N_rank> iter(*this);
409
410 // Set the initial stack configuration by pushing the pointer
411 // to the first element of the array onto the stack N times.
412
413 int i;
414 for (i=1; i < N_rank; ++i)
415 {
416 iter.push(i);
417 expr.push(i);
418 }
419
420 // Load the strides associated with the innermost loop.
421 iter.loadStride(maxRank);
422 expr.loadStride(maxRank);
423
424 /*
425 * Is the stride in the innermost loop equal to 1? If so,
426 * we might take advantage of this and generate more
427 * efficient code.
428 */
429 _bz_bool useUnitStride = iter.isUnitStride(maxRank)
430 && expr.isUnitStride(maxRank);
431
432 /*
433 * Do all array operands share a common stride in the innermost
434 * loop? If so, we can generate more efficient code (but only
435 * if this optimization has been enabled).
436 */
437#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
438 int commonStride = expr.suggestStride(maxRank);
439 if (iter.suggestStride(maxRank) > commonStride)
440 commonStride = iter.suggestStride(maxRank);
441 bool useCommonStride = iter.isStride(maxRank,commonStride)
442 && expr.isStride(maxRank,commonStride);
443
444#ifdef BZ_DEBUG_TRAVERSE
445 BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
446 << "commonStride = " << commonStride << " useCommonStride = "
447 << useCommonStride);
448#endif
449
450#else
451 int commonStride = 1;
452 bool useCommonStride = _bz_false;
453#endif
454
455 /*
456 * The "last" array contains a pointer to the last element
457 * encountered in each "loop".
458 */
459 const T_numtype* _bz_restrict last[N_rank];
460
461 // Set up the initial state of the "last" array
462 for (i=1; i < N_rank; ++i)
463 last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));
464
465 int lastLength = length(maxRank);
466 int firstNoncollapsedLoop = 1;
467
468#ifdef BZ_COLLAPSE_LOOPS
469
470 /*
471 * This bit of code handles collapsing loops. When possible,
472 * the N nested loops are converted into a single loop (basically,
473 * the N-dimensional array is treated as a long vector).
474 * This is important for cases where the length of the innermost
475 * loop is very small, for example a 100x100x3 array.
476 * If this code can't collapse all the loops into a single loop,
477 * it will collapse as many loops as possible starting from the
478 * innermost and working out.
479 */
480
481 // Collapse loops when possible
482 for (i=1; i < N_rank; ++i)
483 {
484 // Figure out which pair of loops we are considering combining.
485 int outerLoopRank = ordering(i);
486 int innerLoopRank = ordering(i-1);
487
488 /*
489 * The canCollapse() routines look at the strides and extents
490 * of the loops, and determine if they can be combined into
491 * one loop.
492 */
493
494 if (canCollapse(outerLoopRank,innerLoopRank)
495 && expr.canCollapse(outerLoopRank,innerLoopRank))
496 {
497 lastLength *= length(outerLoopRank);
498 firstNoncollapsedLoop = i+1;
499 }
500 }
501#endif // BZ_COLLAPSE_LOOPS
502
503 /*
504 * Now we actually perform the loops. This while loop contains
505 * two parts: first, the innermost loop is performed. Then we
506 * exit the loop, and pop our way down the stack until we find
507 * a loop that isn't completed. We then restart the inner loops
508 * and push them onto the stack.
509 */
510
511 while (true) {
512
513 /*
514 * This bit of code handles the innermost loop. It would look
515 * a lot simpler if it weren't for unit stride and common stride
516 * optimizations; these clutter up the code with multiple versions.
517 */
518
519 if ((useUnitStride) || (useCommonStride))
520 {
521 T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
522 + lastLength;
523
524#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
525
526 /*
527 * The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably
528 * be taken out. This was put in place while the unit stride/
529 * common stride optimizations were being implemented and
530 * tested.
531 */
532
533 // Calculate the end of the innermost loop
534 int ubound = lastLength * commonStride;
535
536 /*
537 * This is a real kludge. I didn't want to have to write
538 * a const and non-const version of ArrayIterator, so I use a
539 * const iterator and cast away const. This could
540 * probably be avoided with some trick, but the whole routine
541 * is ugly, so why bother.
542 */
543
544 T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
545
546 /*
547 * BZ_NEEDS_WORK-- need to implement optional unrolling.
548 */
549 if (commonStride == 1)
550 {
551 for (int i=0; i < ubound; ++i)
552 T_update::update(data[i], expr.fastRead(i));
553 }
554#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
555 else {
556 for (int i=0; i < ubound; i += commonStride)
557 T_update::update(data[i], expr.fastRead(i));
558 }
559#endif
560 /*
561 * Tidy up for the fact that we haven't actually been
562 * incrementing the iterators in the innermost loop, by
563 * faking it afterward.
564 */
565 iter.advance(lastLength * commonStride);
566 expr.advance(lastLength * commonStride);
567#else
568 // !BZ_USE_FAST_READ_ARRAY_EXPR
569 // This bit of code not really needed; should remove at some
570 // point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR
571
572 while (iter.data() != end)
573 {
574 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
575 iter.advance(commonStride);
576 expr.advance(commonStride);
577 }
578#endif
579 }
580 else {
581 /*
582 * We don't have a unit stride or common stride in the innermost
583 * loop. This is going to hurt performance. Luckily 95% of
584 * the time, we hit the cases above.
585 */
586 T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
587 + lastLength * stride(maxRank);
588
589 while (iter.data() != end)
590 {
591 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
592 iter.advance();
593 expr.advance();
594 }
595 }
596
597
598 /*
599 * We just finished the innermost loop. Now we pop our way down
600 * the stack, until we hit a loop that hasn't completed yet.
601 */
602 int j = firstNoncollapsedLoop;
603 for (; j < N_rank; ++j)
604 {
605 // Get the next loop
606 int r = ordering(j);
607
608 // Pop-- this restores the data pointers to the first element
609 // encountered in the loop.
610 iter.pop(j);
611 expr.pop(j);
612
613 // Load the stride associated with this loop, and increment
614 // once.
615 iter.loadStride(r);
616 expr.loadStride(r);
617 iter.advance();
618 expr.advance();
619
620 // If we aren't at the end of this loop, then stop popping.
621 if (iter.data() != last[j])
622 break;
623 }
624
625 // Are we completely done?
626 if (j == N_rank)
627 break;
628
629 // No, so push all the inner loops back onto the stack.
630 for (; j >= firstNoncollapsedLoop; --j)
631 {
632 int r2 = ordering(j-1);
633 iter.push(j);
634 expr.push(j);
635 last[j-1] = iter.data() + length(r2) * stride(r2);
636 }
637
638 // Load the stride for the innermost loop again.
639 iter.loadStride(maxRank);
640 expr.loadStride(maxRank);
641 }
642
643 return *this;
644}
645
646template<class T_numtype, int N_rank> template<class T_expr, class T_update>
647inline Array<T_numtype, N_rank>&
648Array<T_numtype, N_rank>::evaluateWithIndexTraversal1(_bz_ArrayExpr<T_expr>
649 expr, T_update)
650{
651 TinyVector<int,N_rank> index;
652
653 if (stride(firstRank) == 1)
654 {
655 T_numtype * _bz_restrict iter = data_;
656 int last = ubound(firstRank);
657
658 for (index[0] = lbound(firstRank); index[0] <= last;
659 ++index[0])
660 {
661 iter[index[0]] = expr(index);
662 }
663 }
664 else {
665 ArrayIterator<T_numtype, N_rank> iter(*this);
666 iter.loadStride(0);
667 int last = ubound(firstRank);
668
669 for (index[0] = lbound(firstRank); index[0] <= last;
670 ++index[0])
671 {
672 T_update::update(*const_cast<T_numtype*>(iter.data()),
673 expr(index));
674 iter.advance();
675 }
676 }
677
678 return *this;
679}
680
681template<class T_numtype, int N_rank> template<class T_expr, class T_update>
682inline Array<T_numtype, N_rank>&
683Array<T_numtype, N_rank>::evaluateWithIndexTraversalN(_bz_ArrayExpr<T_expr>
684 expr, T_update)
685{
686 // Do a stack-type traversal for the destination array and use
687 // index traversal for the source expression
688
689 const int maxRank = ordering(0);
690 const int secondLastRank = ordering(1);
691
692#ifdef BZ_DEBUG_TRAVERSE
693cout << "Index traversal: N_rank = " << N_rank << endl;
694cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
695 << endl;
696cout.flush();
697#endif
698
699 ArrayIterator<T_numtype, N_rank> iter(*this);
700 for (int i=1; i < N_rank; ++i)
701 iter.push(ordering(i));
702
703 iter.loadStride(maxRank);
704
705 TinyVector<int,N_rank> index, last;
706
707 index = storage_.base();
708 last = storage_.base() + length_;
709
710 int lastLength = length(maxRank);
711
712 while (true) {
713
714 for (index[maxRank] = base(maxRank);
715 index[maxRank] < last[maxRank];
716 ++index[maxRank])
717 {
718#ifdef BZ_DEBUG_TRAVERSE
719#if 0
720 cout << "(" << index[0] << "," << index[1] << ") " << endl;
721 cout.flush();
722#endif
723#endif
724
725 T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
726 iter.advance();
727 }
728
729 int j = 1;
730 for (; j < N_rank; ++j)
731 {
732 iter.pop(ordering(j));
733 iter.loadStride(ordering(j));
734 iter.advance();
735
736 index[ordering(j-1)] = base(ordering(j-1));
737 ++index[ordering(j)];
738 if (index[ordering(j)] != last[ordering(j)])
739 break;
740 }
741
742 if (j == N_rank)
743 break;
744
745 for (; j > 0; --j)
746 {
747 iter.push(ordering(j));
748 }
749 iter.loadStride(maxRank);
750 }
751
752 return *this;
753}
754
755// Fast traversals require <set> from the ISO/ANSI C++ standard library
756
757#ifdef BZ_HAVE_STD
758
759template<class T_numtype, int N_rank> template<class T_expr, class T_update>
760inline Array<T_numtype, N_rank>&
761Array<T_numtype, N_rank>::evaluateWithFastTraversal(
762 const TraversalOrder<N_rank - 1>& order, _bz_ArrayExpr<T_expr> expr,
763 T_update)
764{
765 const int maxRank = ordering(0);
766 const int secondLastRank = ordering(1);
767
768#ifdef BZ_DEBUG_TRAVERSE
769cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
770 << endl;
771#endif
772
773 ArrayIterator<T_numtype, N_rank> iter(*this);
774 iter.push(0);
775 expr.push(0);
776
777 _bz_bool useUnitStride = iter.isUnitStride(maxRank)
778 && expr.isUnitStride(maxRank);
779
780#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
781 int commonStride = expr.suggestStride(maxRank);
782 if (iter.suggestStride(maxRank) > commonStride)
783 commonStride = iter.suggestStride(maxRank);
784 bool useCommonStride = iter.isStride(maxRank,commonStride)
785 && expr.isStride(maxRank,commonStride);
786#else
787 int commonStride = 1;
788 bool useCommonStride = _bz_false;
789#endif
790
791 int lastLength = length(maxRank);
792
793 for (int i=0; i < order.length(); ++i)
794 {
795 iter.pop(0);
796 expr.pop(0);
797
798#ifdef BZ_DEBUG_TRAVERSE
799 cerr << "Traversing: " << order[i] << endl;
800#endif
801 // Position the iterator at the start of the next column
802 for (int j=1; j < N_rank; ++j)
803 {
804 iter.loadStride(ordering(j));
805 expr.loadStride(ordering(j));
806
807 int offset = order[i][j-1];
808 iter.advance(offset);
809 expr.advance(offset);
810 }
811
812 iter.loadStride(maxRank);
813 expr.loadStride(maxRank);
814
815 // Evaluate the expression along the column
816
817 if ((useUnitStride) || (useCommonStride))
818 {
819 T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
820 + lastLength * commonStride;
821
822#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
823 int ubound = lastLength * commonStride;
824 T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
825
826 if (commonStride == 1)
827 {
828 #ifndef BZ_ARRAY_FAST_TRAVERSAL_UNROLL
829 for (int i=0; i < ubound; ++i)
830 T_update::update(data[i], expr.fastRead(i));
831 #else
832 int n1 = ubound & 3;
833 int i=0;
834 for (; i < n1; ++i)
835 T_update::update(data[i], expr.fastRead(i));
836
837 for (; i < ubound; i += 4)
838 {
839 T_update::update(data[i], expr.fastRead(i));
840 T_update::update(data[i+1], expr.fastRead(i+1));
841 T_update::update(data[i+2], expr.fastRead(i+2));
842 T_update::update(data[i+3], expr.fastRead(i+3));
843 }
844 #endif // BZ_ARRAY_FAST_TRAVERSAL_UNROLL
845 }
846 #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
847 else {
848 for (int i=0; i < ubound; i += commonStride)
849 T_update::update(data[i], expr.fastRead(i));
850 }
851 #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
852
853 iter.advance(lastLength * commonStride);
854 expr.advance(lastLength * commonStride);
855#else // ! BZ_USE_FAST_READ_ARRAY_EXPR
856 while (iter.data() != last)
857 {
858 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
859 iter.advance(commonStride);
860 expr.advance(commonStride);
861 }
862#endif // BZ_USE_FAST_READ_ARRAY_EXPR
863
864 }
865 else {
866 // No common stride
867
868 T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
869 + lastLength * stride(maxRank);
870
871 while (iter.data() != last)
872 {
873 T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
874 iter.advance();
875 expr.advance();
876 }
877 }
878 }
879
880 return *this;
881}
882#endif // BZ_HAVE_STD
883
884#ifdef BZ_ARRAY_2D_NEW_STENCIL_TILING
885
886#ifdef BZ_ARRAY_2D_STENCIL_TILING
887
888template<class T_numtype, int N_rank> template<class T_expr, class T_update>
889inline Array<T_numtype, N_rank>&
890Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
891 expr, T_update)
892{
893 const int minorRank = ordering(0);
894 const int majorRank = ordering(1);
895
896 ArrayIterator<T_numtype, N_rank> iter(*this);
897 iter.push(0);
898 expr.push(0);
899
900#ifdef BZ_2D_STENCIL_DEBUG
901 int count = 0;
902#endif
903
904 _bz_bool useUnitStride = iter.isUnitStride(minorRank)
905 && expr.isUnitStride(minorRank);
906
907#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
908 int commonStride = expr.suggestStride(minorRank);
909 if (iter.suggestStride(minorRank) > commonStride)
910 commonStride = iter.suggestStride(minorRank);
911 bool useCommonStride = iter.isStride(minorRank,commonStride)
912 && expr.isStride(minorRank,commonStride);
913#else
914 int commonStride = 1;
915 bool useCommonStride = _bz_false;
916#endif
917
918 // Determine if a common major stride exists
919 int commonMajorStride = expr.suggestStride(majorRank);
920 if (iter.suggestStride(majorRank) > commonMajorStride)
921 commonMajorStride = iter.suggestStride(majorRank);
922 bool haveCommonMajorStride = iter.isStride(majorRank,commonMajorStride)
923 && expr.isStride(majorRank,commonMajorStride);
924
925
926 int maxi = length(majorRank);
927 int maxj = length(minorRank);
928
929 const int tileHeight = 16, tileWidth = 3;
930
931 int bi, bj;
932 for (bi=0; bi < maxi; bi += tileHeight)
933 {
934 int ni = bi + tileHeight;
935 if (ni > maxi)
936 ni = maxi;
937
938 // Move back to the beginning of the array
939 iter.pop(0);
940 expr.pop(0);
941
942 // Move to the start of this tile row
943 iter.loadStride(majorRank);
944 iter.advance(bi);
945 expr.loadStride(majorRank);
946 expr.advance(bi);
947
948 // Save this position
949 iter.push(1);
950 expr.push(1);
951
952 for (bj=0; bj < maxj; bj += tileWidth)
953 {
954 // Move to the beginning of the tile row
955 iter.pop(1);
956 expr.pop(1);
957
958 // Move to the top of the current tile (bi,bj)
959 iter.loadStride(minorRank);
960 iter.advance(bj);
961 expr.loadStride(minorRank);
962 expr.advance(bj);
963
964 if (bj + tileWidth <= maxj)
965 {
966 // Strip mining
967
968 if ((useUnitStride) && (haveCommonMajorStride))
969 {
970 int offset = 0;
971 T_numtype* _bz_restrict data = const_cast<T_numtype*>
972 (iter.data());
973
974 for (int i=bi; i < ni; ++i)
975 {
976 _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3;
977
978 // Common subexpression elimination -- compilers
979 // won't necessarily do this on their own.
980 int t1 = offset+1;
981 int t2 = offset+2;
982
983 tmp1 = expr.fastRead(offset);
984 tmp2 = expr.fastRead(t1);
985 tmp3 = expr.fastRead(t2);
986
987 T_update::update(data[0], tmp1);
988 T_update::update(data[1], tmp2);
989 T_update::update(data[2], tmp3);
990
991 offset += commonMajorStride;
992 data += commonMajorStride;
993
994#ifdef BZ_2D_STENCIL_DEBUG
995 count += 3;
996#endif
997 }
998 }
999 else {
1000
1001 for (int i=bi; i < ni; ++i)
1002 {
1003 iter.loadStride(minorRank);
1004 expr.loadStride(minorRank);
1005
1006 // Loop through current row elements
1007 T_update::update(*const_cast<T_numtype*>(iter.data()),
1008 *expr);
1009 iter.advance();
1010 expr.advance();
1011
1012 T_update::update(*const_cast<T_numtype*>(iter.data()),
1013 *expr);
1014 iter.advance();
1015 expr.advance();
1016
1017 T_update::update(*const_cast<T_numtype*>(iter.data()),
1018 *expr);
1019 iter.advance(-2);
1020 expr.advance(-2);
1021
1022 iter.loadStride(majorRank);
1023 expr.loadStride(majorRank);
1024 iter.advance();
1025 expr.advance();
1026
1027#ifdef BZ_2D_STENCIL_DEBUG
1028 count += 3;
1029#endif
1030
1031 }
1032 }
1033 }
1034 else {
1035
1036 // This code handles partial tiles at the bottom of the
1037 // array.
1038
1039 for (int j=bj; j < maxj; ++j)
1040 {
1041 iter.loadStride(majorRank);
1042 expr.loadStride(majorRank);
1043
1044 for (int i=bi; i < ni; ++i)
1045 {
1046 T_update::update(*const_cast<T_numtype*>(iter.data()),
1047 *expr);
1048 iter.advance();
1049 expr.advance();
1050#ifdef BZ_2D_STENCIL_DEBUG
1051 ++count;
1052#endif
1053
1054 }
1055
1056 // Move back to the top of this column
1057 iter.advance(bi-ni);
1058 expr.advance(bi-ni);
1059
1060 // Move over to the next column
1061 iter.loadStride(minorRank);
1062 expr.loadStride(minorRank);
1063
1064 iter.advance();
1065 expr.advance();
1066 }
1067 }
1068 }
1069 }
1070
1071#ifdef BZ_2D_STENCIL_DEBUG
1072 cout << "BZ_2D_STENCIL_DEBUG: count = " << count << endl;
1073#endif
1074
1075 return *this;
1076}
1077
1078#endif // BZ_ARRAY_2D_STENCIL_TILING
1079#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1080
1081
1082
1083#ifndef BZ_ARRAY_2D_NEW_STENCIL_TILING
1084
1085#ifdef BZ_ARRAY_2D_STENCIL_TILING
1086
1087template<class T_numtype, int N_rank> template<class T_expr, class T_update>
1088inline Array<T_numtype, N_rank>&
1089Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
1090 expr, T_update)
1091{
1092 const int minorRank = ordering(0);
1093 const int majorRank = ordering(1);
1094
1095 const int blockSize = 16;
1096
1097 ArrayIterator<T_numtype, N_rank> iter(*this);
1098 iter.push(0);
1099 expr.push(0);
1100
1101 _bz_bool useUnitStride = iter.isUnitStride(minorRank)
1102 && expr.isUnitStride(minorRank);
1103
1104#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1105 int commonStride = expr.suggestStride(minorRank);
1106 if (iter.suggestStride(minorRank) > commonStride)
1107 commonStride = iter.suggestStride(minorRank);
1108 bool useCommonStride = iter.isStride(minorRank,commonStride)
1109 && expr.isStride(minorRank,commonStride);
1110#else
1111 int commonStride = 1;
1112 bool useCommonStride = _bz_false;
1113#endif
1114
1115 int maxi = length(majorRank);
1116 int maxj = length(minorRank);
1117
1118 int bi, bj;
1119 for (bi=0; bi < maxi; bi += blockSize)
1120 {
1121 int ni = bi + blockSize;
1122 if (ni > maxi)
1123 ni = maxi;
1124
1125 for (bj=0; bj < maxj; bj += blockSize)
1126 {
1127 int nj = bj + blockSize;
1128 if (nj > maxj)
1129 nj = maxj;
1130
1131 // Move to the beginning of the array
1132 iter.pop(0);
1133 expr.pop(0);
1134
1135 // Move to the beginning of the tile (bi,bj)
1136 iter.loadStride(majorRank);
1137 iter.advance(bi);
1138 iter.loadStride(minorRank);
1139 iter.advance(bj);
1140
1141 expr.loadStride(majorRank);
1142 expr.advance(bi);
1143 expr.loadStride(minorRank);
1144 expr.advance(bj);
1145
1146 // Loop through tile rows
1147 for (int i=bi; i < ni; ++i)
1148 {
1149 // Save the beginning of this tile row
1150 iter.push(1);
1151 expr.push(1);
1152
1153 // Load the minor stride
1154 iter.loadStride(minorRank);
1155 expr.loadStride(minorRank);
1156
1157 if (useUnitStride)
1158 {
1159 T_numtype* _bz_restrict data = const_cast<T_numtype*>
1160 (iter.data());
1161
1162 int ubound = (nj-bj);
1163 for (int j=0; j < ubound; ++j)
1164 T_update::update(data[j], expr.fastRead(j));
1165 }
1166#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1167 else if (useCommonStride)
1168 {
1169 int ubound = (nj-bj) * commonStride;
1170 T_numtype* _bz_restrict data = const_cast<T_numtype*>
1171 (iter.data());
1172
1173 for (int j=0; j < ubound; j += commonStride)
1174 T_update::update(data[j], expr.fastRead(j));
1175 }
1176#endif
1177 else {
1178 for (int j=bj; j < nj; ++j)
1179 {
1180 // Loop through current row elements
1181 T_update::update(*const_cast<T_numtype*>(iter.data()),
1182 *expr);
1183 iter.advance();
1184 expr.advance();
1185 }
1186 }
1187
1188 // Move back to the beginning of the tile row, then
1189 // move to the next row
1190 iter.pop(1);
1191 iter.loadStride(majorRank);
1192 iter.advance(1);
1193
1194 expr.pop(1);
1195 expr.loadStride(majorRank);
1196 expr.advance(1);
1197 }
1198 }
1199 }
1200
1201 return *this;
1202}
1203#endif // BZ_ARRAY_2D_STENCIL_TILING
1204#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1205
1206BZ_NAMESPACE_END
1207
1208#endif // BZ_ARRAYEVAL_CC
1209
Note: See TracBrowser for help on using the repository browser.