Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

eval.cc@ 3477

Visit:

Last change on this file since 3477 was 221, checked in by ansari, 26 years ago
Creation module DPC/Blitz (blitz 0.4) Reza 09/04/99
File size: 37.6 KB

Line
1	/*
2	* $Id: eval.cc,v 1.1.1.1 1999-04-09 17:59:03 ansari Exp $
3	*
4	* $Log: not supported by cvs2svn $
5	* Revision 1.2 1998/03/14 00:04:47 tveldhui
6	* 0.2-alpha-05
7	*
8	* Revision 1.1 1998/02/25 20:04:01 tveldhui
9	* Initial revision
10	*
11	*/
12
13	#ifndef BZ_ARRAYEVAL_CC
14	#define BZ_ARRAYEVAL_CC
15
16	#ifndef BZ_ARRAY_H
17	#error <blitz/array/eval.cc> must be included via <blitz/array.h>
18	#endif
19
20	BZ_NAMESPACE(blitz)
21
22	/*
23	* Assign an expression to an array. For performance reasons, there are
24	* several traversal mechanisms:
25	*
26	* - Index traversal scans through the destination array in storage order.
27	* The expression is evaluated using a TinyVector<int,N> operand. This
28	* version is used only when there are index placeholders in the expression
29	* (see <blitz/indexexpr.h>)
30	* - Stack traversal also scans through the destination array in storage
31	* order. However, push/pop stack iterators are used.
32	* - Fast traversal follows a Hilbert (or other) space-filling curve to
33	* improve cache reuse for stencilling operations. Currently, the
34	* space filling curves must be generated by calling
35	* generateFastTraversalOrder(TinyVector<int,N_dimensions>).
36	* - 2D tiled traversal follows a tiled traversal, to improve cache reuse
37	* for 2D stencils. Space filling curves have too much overhead to use
38	* in two-dimensions.
39	*
40	* _bz_tryFastTraversal is a helper class. Fast traversals are only
41	* attempted if the expression looks like a stencil -- it's at least
42	* three-dimensional, has at least six array operands, and there are
43	* no index placeholders in the expression. These are all things which
44	* can be checked at compile time, so the if()/else() syntax has been
45	* replaced with this class template.
46	*/
47
48	// Fast traversals require <set> from the ISO/ANSI C++ standard library
49	#ifdef BZ_HAVE_STD
50
51	template<_bz_bool canTryFastTraversal>
52	struct _bz_tryFastTraversal {
53	template<class T_numtype, int N_rank, class T_expr, class T_update>
54	static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
55	_bz_ArrayExpr<T_expr> expr, T_update)
56	{
57	return _bz_false;
58	}
59	};
60
61	template<>
62	struct _bz_tryFastTraversal<_bz_true> {
63	template<class T_numtype, int N_rank, class T_expr, class T_update>
64	static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
65	_bz_ArrayExpr<T_expr> expr, T_update)
66	{
67	// See if there's an appropriate space filling curve available.
68	// Currently fast traversals use an N-1 dimensional curve. The
69	// Nth dimension column corresponding to each point on the curve
70	// is traversed in the normal fashion.
71	TraversalOrderCollection<N_rank-1> traversals;
72	TinyVector<int, N_rank - 1> traversalGridSize;
73
74	for (int i=0; i < N_rank - 1; ++i)
75	traversalGridSize[i] = array.length(array.ordering(i+1));
76
77	#ifdef BZ_DEBUG_TRAVERSE
78	cout << "traversalGridSize = " << traversalGridSize << endl;
79	cout.flush();
80	#endif
81
82	const TraversalOrder<N_rank-1>* order =
83	traversals.find(traversalGridSize);
84
85	if (order)
86	{
87	#ifdef BZ_DEBUG_TRAVERSE
88	cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
89	<< ", " << N_rank << ">: Using stack traversal" << endl;
90	#endif
91	// A curve was available -- use fast traversal.
92	array.evaluateWithFastTraversal(*order, expr, T_update());
93	return _bz_true;
94	}
95
96	return _bz_false;
97	}
98	};
99
100	#endif // BZ_HAVE_STD
101
102	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
103	inline Array<T_numtype, N_rank>&
104	Array<T_numtype, N_rank>::evaluate(_bz_ArrayExpr<T_expr> expr, T_update)
105	{
106	// Check that all arrays have the same shape
107	#ifdef BZ_DEBUG
108	if (!expr.shapeCheck(shape()))
109	{
110	if (assertFailMode == _bz_false)
111	{
112	cerr << "[Blitz++] Shape check failed: Module " << __FILE__
113	<< " line " << __LINE__ << endl
114	<< " Expression: ";
115	prettyPrintFormat format(_bz_true); // Use terse formatting
116	string str;
117	expr.prettyPrint(str, format);
118	cerr << str << endl ;
119	}
120
121	#if 0
122	// Shape dumping is broken by change to using string for prettyPrint
123	<< " Shapes: " << shape() << " = ";
124	prettyPrintFormat format2;
125	format2.setDumpArrayShapesMode();
126	expr.prettyPrint(cerr, format2);
127	cerr << endl;
128	#endif
129	BZ_PRE_FAIL;
130	}
131	#endif
132
133	BZPRECHECK(expr.shapeCheck(shape()),
134	"Shape check failed." << endl << "Expression:");
135
136	BZPRECHECK((T_expr::rank == N_rank) \|\| (T_expr::numArrayOperands == 0),
137	"Assigned rank " << T_expr::rank << " expression to rank "
138	<< N_rank << " array.");
139
140	#ifdef BZ_DEBUG_TRAVERSE
141	cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders
142	<< endl; cout.flush();
143	#endif
144
145	// Tau profiling code. Provide Tau with a pretty-printed version of
146	// the expression.
147	// NEEDS_WORK-- use a static initializer somehow.
148
149	#ifdef BZ_TAU_PROFILING
150	static string exprDescription;
151	if (!exprDescription.length()) // faked static initializer
152	{
153	exprDescription = "A";
154	prettyPrintFormat format(_bz_true); // Terse mode on
155	format.nextArrayOperandSymbol();
156	T_update::prettyPrint(exprDescription);
157	expr.prettyPrint(exprDescription, format);
158	}
159	TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
160	#endif
161
162	// Determine which evaluation mechanism to use
163	if (T_expr::numIndexPlaceholders > 0)
164	{
165	// The expression involves index placeholders, so have to
166	// use index traversal rather than stack traversal.
167
168	if (N_rank == 1)
169	return evaluateWithIndexTraversal1(expr, T_update());
170	else
171	return evaluateWithIndexTraversalN(expr, T_update());
172	}
173	else {
174
175	// If this expression looks like an array stencil, then attempt to
176	// use a fast traversal order.
177	// Fast traversals require <set> from the ISO/ANSI C++ standard
178	// library.
179
180	#ifdef BZ_HAVE_STD
181
182	enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
183	&& (T_expr::numIndexPlaceholders == 0) };
184
185	if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update()))
186	return *this;
187
188	#endif
189
190	#ifdef BZ_ARRAY_2D_STENCIL_TILING
191	// Does this look like a 2-dimensional stencil on a largeish
192	// array?
193
194	if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
195	{
196	// Use a heuristic to determine whether a tiled traversal
197	// is desirable. First, estimate how much L1 cache is needed
198	// to achieve a high hit rate using the stack traversal.
199	// Try to err on the side of using tiled traversal even when
200	// it isn't strictly needed.
201
202	// Assumptions:
203	// Stencil width 3
204	// 3 arrays involved in stencil
205	// Uniform data type in arrays (all T_numtype)
206
207	int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0));
208	if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE)
209	return evaluateWithTiled2DTraversal(expr, T_update());
210	}
211
212	#endif
213
214	// If fast traversal isn't available or appropriate, then just
215	// do a stack traversal.
216	if (N_rank == 1)
217	return evaluateWithStackTraversal1(expr, T_update());
218	else
219	return evaluateWithStackTraversalN(expr, T_update());
220	}
221	}
222
223	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
224	inline Array<T_numtype, N_rank>&
225	Array<T_numtype, N_rank>::evaluateWithStackTraversal1(_bz_ArrayExpr<T_expr>
226	expr, T_update)
227	{
228	#ifdef BZ_DEBUG_TRAVERSE
229	BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
230	<< ", " << N_rank << ">: Using stack traversal");
231	#endif
232	ArrayIterator<T_numtype, N_rank> iter(*this);
233	iter.loadStride(firstRank);
234	expr.loadStride(firstRank);
235
236	_bz_bool useUnitStride = iter.isUnitStride(firstRank)
237	&& expr.isUnitStride(firstRank);
238
239	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
240	int commonStride = expr.suggestStride(firstRank);
241	if (iter.suggestStride(firstRank) > commonStride)
242	commonStride = iter.suggestStride(firstRank);
243	bool useCommonStride = iter.isStride(firstRank,commonStride)
244	&& expr.isStride(firstRank,commonStride);
245
246	#ifdef BZ_DEBUG_TRAVERSE
247	BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl
248	<< " commonStride = " << commonStride << " useCommonStride = "
249	<< useCommonStride);
250	#endif
251	#else
252	int commonStride = 1;
253	bool useCommonStride = _bz_false;
254	#endif
255
256	const T_numtype * last = iter.data() + length(firstRank)
257	* stride(firstRank);
258
259	if (useUnitStride \|\| useCommonStride)
260	{
261	#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
262
263	#ifdef BZ_DEBUG_TRAVERSE
264	BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");
265	#endif
266	int ubound = length(firstRank) * commonStride;
267	T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
268
269	if (commonStride == 1)
270	{
271	#ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
272	for (int i=0; i < ubound; ++i)
273	T_update::update(data[i], expr.fastRead(i));
274	#else
275	int n1 = ubound & 3;
276	int i = 0;
277	for (; i < n1; ++i)
278	T_update::update(data[i], expr.fastRead(i));
279
280	for (; i < ubound; i += 4)
281	{
282	#ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS
283	T_update::update(data[i], expr.fastRead(i));
284	T_update::update(data[i+1], expr.fastRead(i+1));
285	T_update::update(data[i+2], expr.fastRead(i+2));
286	T_update::update(data[i+3], expr.fastRead(i+3));
287	#else
288	int t1 = i+1;
289	int t2 = i+2;
290	int t3 = i+3;
291
292	_bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4;
293
294	tmp1 = expr.fastRead(i);
295	tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1));
296	tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2));
297	tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3));
298
299	T_update::update(data[i], BZ_NO_PROPAGATE(tmp1));
300	T_update::update(data[BZ_NO_PROPAGATE(t1)], tmp2);
301	T_update::update(data[BZ_NO_PROPAGATE(t2)], tmp3);
302	T_update::update(data[BZ_NO_PROPAGATE(t3)], tmp4);
303	#endif
304	}
305	#endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
306
307	}
308	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
309	else {
310
311	#ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
312	for (int i=0; i < ubound; i += commonStride)
313	T_update::update(data[i], expr.fastRead(i));
314	#else
315	int n1 = (length(firstRank) & 3) * commonStride;
316
317	int i = 0;
318	for (; i < n1; i += commonStride)
319	T_update::update(data[i], expr.fastRead(i));
320
321	int strideInc = 4 * commonStride;
322	for (; i < ubound; i += strideInc)
323	{
324	T_update::update(data[i], expr.fastRead(i));
325	int i2 = i + commonStride;
326	T_update::update(data[i2], expr.fastRead(i2));
327	int i3 = i + 2 * commonStride;
328	T_update::update(data[i3], expr.fastRead(i3));
329	int i4 = i + 3 * commonStride;
330	T_update::update(data[i4], expr.fastRead(i4));
331	}
332	#endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
333	}
334	#endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
335
336	#else // ! BZ_USE_FAST_READ_ARRAY_EXPR
337
338	#ifdef BZ_DEBUG_TRAVERSE
339	BZ_DEBUG_MESSAGE("Common stride, no fast read");
340	#endif
341	while (iter.data() != last)
342	{
343	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
344	iter.advance(commonStride);
345	expr.advance(commonStride);
346	}
347	#endif
348	}
349	else {
350	while (iter.data() != last)
351	{
352	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
353	iter.advance();
354	expr.advance();
355	}
356	}
357
358	return *this;
359	}
360
361	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
362	inline Array<T_numtype, N_rank>&
363	Array<T_numtype, N_rank>::evaluateWithStackTraversalN(_bz_ArrayExpr<T_expr>
364	expr, T_update)
365	{
366	/*
367	* A stack traversal replaces the usual nested loops:
368	*
369	* for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
370	* for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
371	* for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
372	* A(i,j,k) = 0;
373	*
374	* with a stack data structure. The stack allows this single
375	* routine to replace any number of nested loops.
376	*
377	* For each dimension (loop), these quantities are needed:
378	* - a pointer to the first element encountered in the loop
379	* - the stride associated with the dimension/loop
380	* - a pointer to the last element encountered in the loop
381	*
382	* The basic idea is that entering each loop is a "push" onto the
383	* stack, and exiting each loop is a "pop". In practice, this
384	* routine treats accesses the stack in a random-access way,
385	* which confuses the picture a bit. But conceptually, that's
386	* what is going on.
387	*/
388
389	/*
390	* ordering(0) gives the dimension associated with the smallest
391	* stride (usually; the exceptions have to do with subarrays and
392	* are uninteresting). We call this dimension maxRank; it will
393	* become the innermost "loop".
394	*
395	* Ordering the loops from ordering(N_rank-1) down to
396	* ordering(0) ensures that the largest stride is associated
397	* with the outermost loop, and the smallest stride with the
398	* innermost. This is critical for good performance on
399	* cached machines.
400	*/
401	const int maxRank = ordering(0);
402	const int secondLastRank = ordering(1);
403
404	// Create an iterator for the array receiving the result
405	ArrayIterator<T_numtype, N_rank> iter(*this);
406
407	// Set the initial stack configuration by pushing the pointer
408	// to the first element of the array onto the stack N times.
409
410	int i;
411	for (i=1; i < N_rank; ++i)
412	{
413	iter.push(i);
414	expr.push(i);
415	}
416
417	// Load the strides associated with the innermost loop.
418	iter.loadStride(maxRank);
419	expr.loadStride(maxRank);
420
421	/*
422	* Is the stride in the innermost loop equal to 1? If so,
423	* we might take advantage of this and generate more
424	* efficient code.
425	*/
426	_bz_bool useUnitStride = iter.isUnitStride(maxRank)
427	&& expr.isUnitStride(maxRank);
428
429	/*
430	* Do all array operands share a common stride in the innermost
431	* loop? If so, we can generate more efficient code (but only
432	* if this optimization has been enabled).
433	*/
434	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
435	int commonStride = expr.suggestStride(maxRank);
436	if (iter.suggestStride(maxRank) > commonStride)
437	commonStride = iter.suggestStride(maxRank);
438	bool useCommonStride = iter.isStride(maxRank,commonStride)
439	&& expr.isStride(maxRank,commonStride);
440
441	#ifdef BZ_DEBUG_TRAVERSE
442	BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
443	<< "commonStride = " << commonStride << " useCommonStride = "
444	<< useCommonStride);
445	#endif
446
447	#else
448	int commonStride = 1;
449	bool useCommonStride = _bz_false;
450	#endif
451
452	/*
453	* The "last" array contains a pointer to the last element
454	* encountered in each "loop".
455	*/
456	const T_numtype* _bz_restrict last[N_rank];
457
458	// Set up the initial state of the "last" array
459	for (i=1; i < N_rank; ++i)
460	last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));
461
462	int lastLength = length(maxRank);
463	int firstNoncollapsedLoop = 1;
464
465	#ifdef BZ_COLLAPSE_LOOPS
466
467	/*
468	* This bit of code handles collapsing loops. When possible,
469	* the N nested loops are converted into a single loop (basically,
470	* the N-dimensional array is treated as a long vector).
471	* This is important for cases where the length of the innermost
472	* loop is very small, for example a 100x100x3 array.
473	* If this code can't collapse all the loops into a single loop,
474	* it will collapse as many loops as possible starting from the
475	* innermost and working out.
476	*/
477
478	// Collapse loops when possible
479	for (i=1; i < N_rank; ++i)
480	{
481	// Figure out which pair of loops we are considering combining.
482	int outerLoopRank = ordering(i);
483	int innerLoopRank = ordering(i-1);
484
485	/*
486	* The canCollapse() routines look at the strides and extents
487	* of the loops, and determine if they can be combined into
488	* one loop.
489	*/
490
491	if (canCollapse(outerLoopRank,innerLoopRank)
492	&& expr.canCollapse(outerLoopRank,innerLoopRank))
493	{
494	lastLength *= length(outerLoopRank);
495	firstNoncollapsedLoop = i+1;
496	}
497	}
498	#endif // BZ_COLLAPSE_LOOPS
499
500	/*
501	* Now we actually perform the loops. This while loop contains
502	* two parts: first, the innermost loop is performed. Then we
503	* exit the loop, and pop our way down the stack until we find
504	* a loop that isn't completed. We then restart the inner loops
505	* and push them onto the stack.
506	*/
507
508	while (true) {
509
510	/*
511	* This bit of code handles the innermost loop. It would look
512	* a lot simpler if it weren't for unit stride and common stride
513	* optimizations; these clutter up the code with multiple versions.
514	*/
515
516	if ((useUnitStride) \|\| (useCommonStride))
517	{
518	T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
519	+ lastLength;
520
521	#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
522
523	/*
524	* The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably
525	* be taken out. This was put in place while the unit stride/
526	* common stride optimizations were being implemented and
527	* tested.
528	*/
529
530	// Calculate the end of the innermost loop
531	int ubound = lastLength * commonStride;
532
533	/*
534	* This is a real kludge. I didn't want to have to write
535	* a const and non-const version of ArrayIterator, so I use a
536	* const iterator and cast away const. This could
537	* probably be avoided with some trick, but the whole routine
538	* is ugly, so why bother.
539	*/
540
541	T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
542
543	/*
544	* BZ_NEEDS_WORK-- need to implement optional unrolling.
545	*/
546	if (commonStride == 1)
547	{
548	for (int i=0; i < ubound; ++i)
549	T_update::update(data[i], expr.fastRead(i));
550	}
551	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
552	else {
553	for (int i=0; i < ubound; i += commonStride)
554	T_update::update(data[i], expr.fastRead(i));
555	}
556	#endif
557	/*
558	* Tidy up for the fact that we haven't actually been
559	* incrementing the iterators in the innermost loop, by
560	* faking it afterward.
561	*/
562	iter.advance(lastLength * commonStride);
563	expr.advance(lastLength * commonStride);
564	#else
565	// !BZ_USE_FAST_READ_ARRAY_EXPR
566	// This bit of code not really needed; should remove at some
567	// point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR
568
569	while (iter.data() != end)
570	{
571	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
572	iter.advance(commonStride);
573	expr.advance(commonStride);
574	}
575	#endif
576	}
577	else {
578	/*
579	* We don't have a unit stride or common stride in the innermost
580	* loop. This is going to hurt performance. Luckily 95% of
581	* the time, we hit the cases above.
582	*/
583	T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
584	+ lastLength * stride(maxRank);
585
586	while (iter.data() != end)
587	{
588	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
589	iter.advance();
590	expr.advance();
591	}
592	}
593
594
595	/*
596	* We just finished the innermost loop. Now we pop our way down
597	* the stack, until we hit a loop that hasn't completed yet.
598	*/
599	int j = firstNoncollapsedLoop;
600	for (; j < N_rank; ++j)
601	{
602	// Get the next loop
603	int r = ordering(j);
604
605	// Pop-- this restores the data pointers to the first element
606	// encountered in the loop.
607	iter.pop(j);
608	expr.pop(j);
609
610	// Load the stride associated with this loop, and increment
611	// once.
612	iter.loadStride(r);
613	expr.loadStride(r);
614	iter.advance();
615	expr.advance();
616
617	// If we aren't at the end of this loop, then stop popping.
618	if (iter.data() != last[j])
619	break;
620	}
621
622	// Are we completely done?
623	if (j == N_rank)
624	break;
625
626	// No, so push all the inner loops back onto the stack.
627	for (; j >= firstNoncollapsedLoop; --j)
628	{
629	int r2 = ordering(j-1);
630	iter.push(j);
631	expr.push(j);
632	last[j-1] = iter.data() + length(r2) * stride(r2);
633	}
634
635	// Load the stride for the innermost loop again.
636	iter.loadStride(maxRank);
637	expr.loadStride(maxRank);
638	}
639
640	return *this;
641	}
642
643	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
644	inline Array<T_numtype, N_rank>&
645	Array<T_numtype, N_rank>::evaluateWithIndexTraversal1(_bz_ArrayExpr<T_expr>
646	expr, T_update)
647	{
648	TinyVector<int,N_rank> index;
649
650	if (stride(firstRank) == 1)
651	{
652	T_numtype * _bz_restrict iter = data_;
653	int last = ubound(firstRank);
654
655	for (index[0] = lbound(firstRank); index[0] <= last;
656	++index[0])
657	{
658	iter[index[0]] = expr(index);
659	}
660	}
661	else {
662	ArrayIterator<T_numtype, N_rank> iter(*this);
663	iter.loadStride(0);
664	int last = ubound(firstRank);
665
666	for (index[0] = lbound(firstRank); index[0] <= last;
667	++index[0])
668	{
669	T_update::update(const_cast<T_numtype>(iter.data()),
670	expr(index));
671	iter.advance();
672	}
673	}
674
675	return *this;
676	}
677
678	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
679	inline Array<T_numtype, N_rank>&
680	Array<T_numtype, N_rank>::evaluateWithIndexTraversalN(_bz_ArrayExpr<T_expr>
681	expr, T_update)
682	{
683	// Do a stack-type traversal for the destination array and use
684	// index traversal for the source expression
685
686	const int maxRank = ordering(0);
687	const int secondLastRank = ordering(1);
688
689	#ifdef BZ_DEBUG_TRAVERSE
690	cout << "Index traversal: N_rank = " << N_rank << endl;
691	cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
692	<< endl;
693	cout.flush();
694	#endif
695
696	ArrayIterator<T_numtype, N_rank> iter(*this);
697	for (int i=1; i < N_rank; ++i)
698	iter.push(ordering(i));
699
700	iter.loadStride(maxRank);
701
702	TinyVector<int,N_rank> index, last;
703
704	index = storage_.base();
705	last = storage_.base() + length_;
706
707	int lastLength = length(maxRank);
708
709	while (true) {
710
711	for (index[maxRank] = base(maxRank);
712	index[maxRank] < last[maxRank];
713	++index[maxRank])
714	{
715	#ifdef BZ_DEBUG_TRAVERSE
716	#if 0
717	cout << "(" << index[0] << "," << index[1] << ") " << endl;
718	cout.flush();
719	#endif
720	#endif
721
722	T_update::update(const_cast<T_numtype>(iter.data()), expr(index));
723	iter.advance();
724	}
725
726	int j = 1;
727	for (; j < N_rank; ++j)
728	{
729	iter.pop(ordering(j));
730	iter.loadStride(ordering(j));
731	iter.advance();
732
733	index[ordering(j-1)] = base(ordering(j-1));
734	++index[ordering(j)];
735	if (index[ordering(j)] != last[ordering(j)])
736	break;
737	}
738
739	if (j == N_rank)
740	break;
741
742	for (; j > 0; --j)
743	{
744	iter.push(ordering(j));
745	}
746	iter.loadStride(maxRank);
747	}
748
749	return *this;
750	}
751
752	// Fast traversals require <set> from the ISO/ANSI C++ standard library
753
754	#ifdef BZ_HAVE_STD
755
756	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
757	inline Array<T_numtype, N_rank>&
758	Array<T_numtype, N_rank>::evaluateWithFastTraversal(
759	const TraversalOrder<N_rank - 1>& order, _bz_ArrayExpr<T_expr> expr,
760	T_update)
761	{
762	const int maxRank = ordering(0);
763	const int secondLastRank = ordering(1);
764
765	#ifdef BZ_DEBUG_TRAVERSE
766	cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
767	<< endl;
768	#endif
769
770	ArrayIterator<T_numtype, N_rank> iter(*this);
771	iter.push(0);
772	expr.push(0);
773
774	_bz_bool useUnitStride = iter.isUnitStride(maxRank)
775	&& expr.isUnitStride(maxRank);
776
777	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
778	int commonStride = expr.suggestStride(maxRank);
779	if (iter.suggestStride(maxRank) > commonStride)
780	commonStride = iter.suggestStride(maxRank);
781	bool useCommonStride = iter.isStride(maxRank,commonStride)
782	&& expr.isStride(maxRank,commonStride);
783	#else
784	int commonStride = 1;
785	bool useCommonStride = _bz_false;
786	#endif
787
788	int lastLength = length(maxRank);
789
790	for (int i=0; i < order.length(); ++i)
791	{
792	iter.pop(0);
793	expr.pop(0);
794
795	#ifdef BZ_DEBUG_TRAVERSE
796	cerr << "Traversing: " << order[i] << endl;
797	#endif
798	// Position the iterator at the start of the next column
799	for (int j=1; j < N_rank; ++j)
800	{
801	iter.loadStride(ordering(j));
802	expr.loadStride(ordering(j));
803
804	int offset = order[i][j-1];
805	iter.advance(offset);
806	expr.advance(offset);
807	}
808
809	iter.loadStride(maxRank);
810	expr.loadStride(maxRank);
811
812	// Evaluate the expression along the column
813
814	if ((useUnitStride) \|\| (useCommonStride))
815	{
816	T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
817	+ lastLength * commonStride;
818
819	#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
820	int ubound = lastLength * commonStride;
821	T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
822
823	if (commonStride == 1)
824	{
825	#ifndef BZ_ARRAY_FAST_TRAVERSAL_UNROLL
826	for (int i=0; i < ubound; ++i)
827	T_update::update(data[i], expr.fastRead(i));
828	#else
829	int n1 = ubound & 3;
830	int i=0;
831	for (; i < n1; ++i)
832	T_update::update(data[i], expr.fastRead(i));
833
834	for (; i < ubound; i += 4)
835	{
836	T_update::update(data[i], expr.fastRead(i));
837	T_update::update(data[i+1], expr.fastRead(i+1));
838	T_update::update(data[i+2], expr.fastRead(i+2));
839	T_update::update(data[i+3], expr.fastRead(i+3));
840	}
841	#endif // BZ_ARRAY_FAST_TRAVERSAL_UNROLL
842	}
843	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
844	else {
845	for (int i=0; i < ubound; i += commonStride)
846	T_update::update(data[i], expr.fastRead(i));
847	}
848	#endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
849
850	iter.advance(lastLength * commonStride);
851	expr.advance(lastLength * commonStride);
852	#else // ! BZ_USE_FAST_READ_ARRAY_EXPR
853	while (iter.data() != last)
854	{
855	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
856	iter.advance(commonStride);
857	expr.advance(commonStride);
858	}
859	#endif // BZ_USE_FAST_READ_ARRAY_EXPR
860
861	}
862	else {
863	// No common stride
864
865	T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
866	+ lastLength * stride(maxRank);
867
868	while (iter.data() != last)
869	{
870	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
871	iter.advance();
872	expr.advance();
873	}
874	}
875	}
876
877	return *this;
878	}
879	#endif // BZ_HAVE_STD
880
881	#ifdef BZ_ARRAY_2D_NEW_STENCIL_TILING
882
883	#ifdef BZ_ARRAY_2D_STENCIL_TILING
884
885	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
886	inline Array<T_numtype, N_rank>&
887	Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
888	expr, T_update)
889	{
890	const int minorRank = ordering(0);
891	const int majorRank = ordering(1);
892
893	ArrayIterator<T_numtype, N_rank> iter(*this);
894	iter.push(0);
895	expr.push(0);
896
897	#ifdef BZ_2D_STENCIL_DEBUG
898	int count = 0;
899	#endif
900
901	_bz_bool useUnitStride = iter.isUnitStride(minorRank)
902	&& expr.isUnitStride(minorRank);
903
904	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
905	int commonStride = expr.suggestStride(minorRank);
906	if (iter.suggestStride(minorRank) > commonStride)
907	commonStride = iter.suggestStride(minorRank);
908	bool useCommonStride = iter.isStride(minorRank,commonStride)
909	&& expr.isStride(minorRank,commonStride);
910	#else
911	int commonStride = 1;
912	bool useCommonStride = _bz_false;
913	#endif
914
915	// Determine if a common major stride exists
916	int commonMajorStride = expr.suggestStride(majorRank);
917	if (iter.suggestStride(majorRank) > commonMajorStride)
918	commonMajorStride = iter.suggestStride(majorRank);
919	bool haveCommonMajorStride = iter.isStride(majorRank,commonMajorStride)
920	&& expr.isStride(majorRank,commonMajorStride);
921
922
923	int maxi = length(majorRank);
924	int maxj = length(minorRank);
925
926	const int tileHeight = 16, tileWidth = 3;
927
928	int bi, bj;
929	for (bi=0; bi < maxi; bi += tileHeight)
930	{
931	int ni = bi + tileHeight;
932	if (ni > maxi)
933	ni = maxi;
934
935	// Move back to the beginning of the array
936	iter.pop(0);
937	expr.pop(0);
938
939	// Move to the start of this tile row
940	iter.loadStride(majorRank);
941	iter.advance(bi);
942	expr.loadStride(majorRank);
943	expr.advance(bi);
944
945	// Save this position
946	iter.push(1);
947	expr.push(1);
948
949	for (bj=0; bj < maxj; bj += tileWidth)
950	{
951	// Move to the beginning of the tile row
952	iter.pop(1);
953	expr.pop(1);
954
955	// Move to the top of the current tile (bi,bj)
956	iter.loadStride(minorRank);
957	iter.advance(bj);
958	expr.loadStride(minorRank);
959	expr.advance(bj);
960
961	if (bj + tileWidth <= maxj)
962	{
963	// Strip mining
964
965	if ((useUnitStride) && (haveCommonMajorStride))
966	{
967	int offset = 0;
968	T_numtype* _bz_restrict data = const_cast<T_numtype*>
969	(iter.data());
970
971	for (int i=bi; i < ni; ++i)
972	{
973	_bz_typename T_expr::T_numtype tmp1, tmp2, tmp3;
974
975	// Common subexpression elimination -- compilers
976	// won't necessarily do this on their own.
977	int t1 = offset+1;
978	int t2 = offset+2;
979
980	tmp1 = expr.fastRead(offset);
981	tmp2 = expr.fastRead(t1);
982	tmp3 = expr.fastRead(t2);
983
984	T_update::update(data[0], tmp1);
985	T_update::update(data[1], tmp2);
986	T_update::update(data[2], tmp3);
987
988	offset += commonMajorStride;
989	data += commonMajorStride;
990
991	#ifdef BZ_2D_STENCIL_DEBUG
992	count += 3;
993	#endif
994	}
995	}
996	else {
997
998	for (int i=bi; i < ni; ++i)
999	{
1000	iter.loadStride(minorRank);
1001	expr.loadStride(minorRank);
1002
1003	// Loop through current row elements
1004	T_update::update(const_cast<T_numtype>(iter.data()),
1005	*expr);
1006	iter.advance();
1007	expr.advance();
1008
1009	T_update::update(const_cast<T_numtype>(iter.data()),
1010	*expr);
1011	iter.advance();
1012	expr.advance();
1013
1014	T_update::update(const_cast<T_numtype>(iter.data()),
1015	*expr);
1016	iter.advance(-2);
1017	expr.advance(-2);
1018
1019	iter.loadStride(majorRank);
1020	expr.loadStride(majorRank);
1021	iter.advance();
1022	expr.advance();
1023
1024	#ifdef BZ_2D_STENCIL_DEBUG
1025	count += 3;
1026	#endif
1027
1028	}
1029	}
1030	}
1031	else {
1032
1033	// This code handles partial tiles at the bottom of the
1034	// array.
1035
1036	for (int j=bj; j < maxj; ++j)
1037	{
1038	iter.loadStride(majorRank);
1039	expr.loadStride(majorRank);
1040
1041	for (int i=bi; i < ni; ++i)
1042	{
1043	T_update::update(const_cast<T_numtype>(iter.data()),
1044	*expr);
1045	iter.advance();
1046	expr.advance();
1047	#ifdef BZ_2D_STENCIL_DEBUG
1048	++count;
1049	#endif
1050
1051	}
1052
1053	// Move back to the top of this column
1054	iter.advance(bi-ni);
1055	expr.advance(bi-ni);
1056
1057	// Move over to the next column
1058	iter.loadStride(minorRank);
1059	expr.loadStride(minorRank);
1060
1061	iter.advance();
1062	expr.advance();
1063	}
1064	}
1065	}
1066	}
1067
1068	#ifdef BZ_2D_STENCIL_DEBUG
1069	cout << "BZ_2D_STENCIL_DEBUG: count = " << count << endl;
1070	#endif
1071
1072	return *this;
1073	}
1074
1075	#endif // BZ_ARRAY_2D_STENCIL_TILING
1076	#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1077
1078
1079
1080	#ifndef BZ_ARRAY_2D_NEW_STENCIL_TILING
1081
1082	#ifdef BZ_ARRAY_2D_STENCIL_TILING
1083
1084	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
1085	inline Array<T_numtype, N_rank>&
1086	Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
1087	expr, T_update)
1088	{
1089	const int minorRank = ordering(0);
1090	const int majorRank = ordering(1);
1091
1092	const int blockSize = 16;
1093
1094	ArrayIterator<T_numtype, N_rank> iter(*this);
1095	iter.push(0);
1096	expr.push(0);
1097
1098	_bz_bool useUnitStride = iter.isUnitStride(minorRank)
1099	&& expr.isUnitStride(minorRank);
1100
1101	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1102	int commonStride = expr.suggestStride(minorRank);
1103	if (iter.suggestStride(minorRank) > commonStride)
1104	commonStride = iter.suggestStride(minorRank);
1105	bool useCommonStride = iter.isStride(minorRank,commonStride)
1106	&& expr.isStride(minorRank,commonStride);
1107	#else
1108	int commonStride = 1;
1109	bool useCommonStride = _bz_false;
1110	#endif
1111
1112	int maxi = length(majorRank);
1113	int maxj = length(minorRank);
1114
1115	int bi, bj;
1116	for (bi=0; bi < maxi; bi += blockSize)
1117	{
1118	int ni = bi + blockSize;
1119	if (ni > maxi)
1120	ni = maxi;
1121
1122	for (bj=0; bj < maxj; bj += blockSize)
1123	{
1124	int nj = bj + blockSize;
1125	if (nj > maxj)
1126	nj = maxj;
1127
1128	// Move to the beginning of the array
1129	iter.pop(0);
1130	expr.pop(0);
1131
1132	// Move to the beginning of the tile (bi,bj)
1133	iter.loadStride(majorRank);
1134	iter.advance(bi);
1135	iter.loadStride(minorRank);
1136	iter.advance(bj);
1137
1138	expr.loadStride(majorRank);
1139	expr.advance(bi);
1140	expr.loadStride(minorRank);
1141	expr.advance(bj);
1142
1143	// Loop through tile rows
1144	for (int i=bi; i < ni; ++i)
1145	{
1146	// Save the beginning of this tile row
1147	iter.push(1);
1148	expr.push(1);
1149
1150	// Load the minor stride
1151	iter.loadStride(minorRank);
1152	expr.loadStride(minorRank);
1153
1154	if (useUnitStride)
1155	{
1156	T_numtype* _bz_restrict data = const_cast<T_numtype*>
1157	(iter.data());
1158
1159	int ubound = (nj-bj);
1160	for (int j=0; j < ubound; ++j)
1161	T_update::update(data[j], expr.fastRead(j));
1162	}
1163	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1164	else if (useCommonStride)
1165	{
1166	int ubound = (nj-bj) * commonStride;
1167	T_numtype* _bz_restrict data = const_cast<T_numtype*>
1168	(iter.data());
1169
1170	for (int j=0; j < ubound; j += commonStride)
1171	T_update::update(data[j], expr.fastRead(j));
1172	}
1173	#endif
1174	else {
1175	for (int j=bj; j < nj; ++j)
1176	{
1177	// Loop through current row elements
1178	T_update::update(const_cast<T_numtype>(iter.data()),
1179	*expr);
1180	iter.advance();
1181	expr.advance();
1182	}
1183	}
1184
1185	// Move back to the beginning of the tile row, then
1186	// move to the next row
1187	iter.pop(1);
1188	iter.loadStride(majorRank);
1189	iter.advance(1);
1190
1191	expr.pop(1);
1192	expr.loadStride(majorRank);
1193	expr.advance(1);
1194	}
1195	}
1196	}
1197
1198	return *this;
1199	}
1200	#endif // BZ_ARRAY_2D_STENCIL_TILING
1201	#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1202
1203	BZ_NAMESPACE_END
1204
1205	#endif // BZ_ARRAYEVAL_CC
1206

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: Sophya/trunk/SophyaExt/Blitz/blitz/array/eval.cc@ 3477

Download in other formats: