Context Navigation

eval.cc

Visit:

Last change on this file was 658, checked in by ansari, 26 years ago
no message
File size: 37.7 KB

Rev	Line
[658]	1	/*
	2	* $Id: eval.cc,v 1.1.1.1 1999-11-26 16:37:06 ansari Exp $
	3	*
	4	* $Log: not supported by cvs2svn $
	5	// Revision 1.1.1.1 1999/04/09 17:59:03 ansari
	6	// Creation module DPC/Blitz (blitz 0.4) Reza 09/04/99
	7	//
	8	* Revision 1.2 1998/03/14 00:04:47 tveldhui
	9	* 0.2-alpha-05
	10	*
	11	* Revision 1.1 1998/02/25 20:04:01 tveldhui
	12	* Initial revision
	13	*
	14	*/
	15
	16	#ifndef BZ_ARRAYEVAL_CC
	17	#define BZ_ARRAYEVAL_CC
	18
	19	#ifndef BZ_ARRAY_H
	20	#error <blitz/array/eval.cc> must be included via <blitz/array.h>
	21	#endif
	22
	23	BZ_NAMESPACE(blitz)
	24
	25	/*
	26	* Assign an expression to an array. For performance reasons, there are
	27	* several traversal mechanisms:
	28	*
	29	* - Index traversal scans through the destination array in storage order.
	30	* The expression is evaluated using a TinyVector<int,N> operand. This
	31	* version is used only when there are index placeholders in the expression
	32	* (see <blitz/indexexpr.h>)
	33	* - Stack traversal also scans through the destination array in storage
	34	* order. However, push/pop stack iterators are used.
	35	* - Fast traversal follows a Hilbert (or other) space-filling curve to
	36	* improve cache reuse for stencilling operations. Currently, the
	37	* space filling curves must be generated by calling
	38	* generateFastTraversalOrder(TinyVector<int,N_dimensions>).
	39	* - 2D tiled traversal follows a tiled traversal, to improve cache reuse
	40	* for 2D stencils. Space filling curves have too much overhead to use
	41	* in two-dimensions.
	42	*
	43	* _bz_tryFastTraversal is a helper class. Fast traversals are only
	44	* attempted if the expression looks like a stencil -- it's at least
	45	* three-dimensional, has at least six array operands, and there are
	46	* no index placeholders in the expression. These are all things which
	47	* can be checked at compile time, so the if()/else() syntax has been
	48	* replaced with this class template.
	49	*/
	50
	51	// Fast traversals require <set> from the ISO/ANSI C++ standard library
	52	#ifdef BZ_HAVE_STD
	53
	54	template<_bz_bool canTryFastTraversal>
	55	struct _bz_tryFastTraversal {
	56	template<class T_numtype, int N_rank, class T_expr, class T_update>
	57	static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
	58	_bz_ArrayExpr<T_expr> expr, T_update)
	59	{
	60	return _bz_false;
	61	}
	62	};
	63
	64	template<>
	65	struct _bz_tryFastTraversal<_bz_true> {
	66	template<class T_numtype, int N_rank, class T_expr, class T_update>
	67	static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
	68	_bz_ArrayExpr<T_expr> expr, T_update)
	69	{
	70	// See if there's an appropriate space filling curve available.
	71	// Currently fast traversals use an N-1 dimensional curve. The
	72	// Nth dimension column corresponding to each point on the curve
	73	// is traversed in the normal fashion.
	74	TraversalOrderCollection<N_rank-1> traversals;
	75	TinyVector<int, N_rank - 1> traversalGridSize;
	76
	77	for (int i=0; i < N_rank - 1; ++i)
	78	traversalGridSize[i] = array.length(array.ordering(i+1));
	79
	80	#ifdef BZ_DEBUG_TRAVERSE
	81	cout << "traversalGridSize = " << traversalGridSize << endl;
	82	cout.flush();
	83	#endif
	84
	85	const TraversalOrder<N_rank-1>* order =
	86	traversals.find(traversalGridSize);
	87
	88	if (order)
	89	{
	90	#ifdef BZ_DEBUG_TRAVERSE
	91	cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
	92	<< ", " << N_rank << ">: Using stack traversal" << endl;
	93	#endif
	94	// A curve was available -- use fast traversal.
	95	array.evaluateWithFastTraversal(*order, expr, T_update());
	96	return _bz_true;
	97	}
	98
	99	return _bz_false;
	100	}
	101	};
	102
	103	#endif // BZ_HAVE_STD
	104
	105	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
	106	inline Array<T_numtype, N_rank>&
	107	Array<T_numtype, N_rank>::evaluate(_bz_ArrayExpr<T_expr> expr, T_update)
	108	{
	109	// Check that all arrays have the same shape
	110	#ifdef BZ_DEBUG
	111	if (!expr.shapeCheck(shape()))
	112	{
	113	if (assertFailMode == _bz_false)
	114	{
	115	cerr << "[Blitz++] Shape check failed: Module " << __FILE__
	116	<< " line " << __LINE__ << endl
	117	<< " Expression: ";
	118	prettyPrintFormat format(_bz_true); // Use terse formatting
	119	string str;
	120	expr.prettyPrint(str, format);
	121	cerr << str << endl ;
	122	}
	123
	124	#if 0
	125	// Shape dumping is broken by change to using string for prettyPrint
	126	<< " Shapes: " << shape() << " = ";
	127	prettyPrintFormat format2;
	128	format2.setDumpArrayShapesMode();
	129	expr.prettyPrint(cerr, format2);
	130	cerr << endl;
	131	#endif
	132	BZ_PRE_FAIL;
	133	}
	134	#endif
	135
	136	BZPRECHECK(expr.shapeCheck(shape()),
	137	"Shape check failed." << endl << "Expression:");
	138
	139	BZPRECHECK((T_expr::rank == N_rank) \|\| (T_expr::numArrayOperands == 0),
	140	"Assigned rank " << T_expr::rank << " expression to rank "
	141	<< N_rank << " array.");
	142
	143	#ifdef BZ_DEBUG_TRAVERSE
	144	cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders
	145	<< endl; cout.flush();
	146	#endif
	147
	148	// Tau profiling code. Provide Tau with a pretty-printed version of
	149	// the expression.
	150	// NEEDS_WORK-- use a static initializer somehow.
	151
	152	#ifdef BZ_TAU_PROFILING
	153	static string exprDescription;
	154	if (!exprDescription.length()) // faked static initializer
	155	{
	156	exprDescription = "A";
	157	prettyPrintFormat format(_bz_true); // Terse mode on
	158	format.nextArrayOperandSymbol();
	159	T_update::prettyPrint(exprDescription);
	160	expr.prettyPrint(exprDescription, format);
	161	}
	162	TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
	163	#endif
	164
	165	// Determine which evaluation mechanism to use
	166	if (T_expr::numIndexPlaceholders > 0)
	167	{
	168	// The expression involves index placeholders, so have to
	169	// use index traversal rather than stack traversal.
	170
	171	if (N_rank == 1)
	172	return evaluateWithIndexTraversal1(expr, T_update());
	173	else
	174	return evaluateWithIndexTraversalN(expr, T_update());
	175	}
	176	else {
	177
	178	// If this expression looks like an array stencil, then attempt to
	179	// use a fast traversal order.
	180	// Fast traversals require <set> from the ISO/ANSI C++ standard
	181	// library.
	182
	183	#ifdef BZ_HAVE_STD
	184
	185	enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
	186	&& (T_expr::numIndexPlaceholders == 0) };
	187
	188	if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update()))
	189	return *this;
	190
	191	#endif
	192
	193	#ifdef BZ_ARRAY_2D_STENCIL_TILING
	194	// Does this look like a 2-dimensional stencil on a largeish
	195	// array?
	196
	197	if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
	198	{
	199	// Use a heuristic to determine whether a tiled traversal
	200	// is desirable. First, estimate how much L1 cache is needed
	201	// to achieve a high hit rate using the stack traversal.
	202	// Try to err on the side of using tiled traversal even when
	203	// it isn't strictly needed.
	204
	205	// Assumptions:
	206	// Stencil width 3
	207	// 3 arrays involved in stencil
	208	// Uniform data type in arrays (all T_numtype)
	209
	210	int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0));
	211	if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE)
	212	return evaluateWithTiled2DTraversal(expr, T_update());
	213	}
	214
	215	#endif
	216
	217	// If fast traversal isn't available or appropriate, then just
	218	// do a stack traversal.
	219	if (N_rank == 1)
	220	return evaluateWithStackTraversal1(expr, T_update());
	221	else
	222	return evaluateWithStackTraversalN(expr, T_update());
	223	}
	224	}
	225
	226	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
	227	inline Array<T_numtype, N_rank>&
	228	Array<T_numtype, N_rank>::evaluateWithStackTraversal1(_bz_ArrayExpr<T_expr>
	229	expr, T_update)
	230	{
	231	#ifdef BZ_DEBUG_TRAVERSE
	232	BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
	233	<< ", " << N_rank << ">: Using stack traversal");
	234	#endif
	235	ArrayIterator<T_numtype, N_rank> iter(*this);
	236	iter.loadStride(firstRank);
	237	expr.loadStride(firstRank);
	238
	239	_bz_bool useUnitStride = iter.isUnitStride(firstRank)
	240	&& expr.isUnitStride(firstRank);
	241
	242	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	243	int commonStride = expr.suggestStride(firstRank);
	244	if (iter.suggestStride(firstRank) > commonStride)
	245	commonStride = iter.suggestStride(firstRank);
	246	bool useCommonStride = iter.isStride(firstRank,commonStride)
	247	&& expr.isStride(firstRank,commonStride);
	248
	249	#ifdef BZ_DEBUG_TRAVERSE
	250	BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl
	251	<< " commonStride = " << commonStride << " useCommonStride = "
	252	<< useCommonStride);
	253	#endif
	254	#else
	255	int commonStride = 1;
	256	bool useCommonStride = _bz_false;
	257	#endif
	258
	259	const T_numtype * last = iter.data() + length(firstRank)
	260	* stride(firstRank);
	261
	262	if (useUnitStride \|\| useCommonStride)
	263	{
	264	#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
	265
	266	#ifdef BZ_DEBUG_TRAVERSE
	267	BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");
	268	#endif
	269	int ubound = length(firstRank) * commonStride;
	270	T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
	271
	272	if (commonStride == 1)
	273	{
	274	#ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
	275	for (int i=0; i < ubound; ++i)
	276	T_update::update(data[i], expr.fastRead(i));
	277	#else
	278	int n1 = ubound & 3;
	279	int i = 0;
	280	for (; i < n1; ++i)
	281	T_update::update(data[i], expr.fastRead(i));
	282
	283	for (; i < ubound; i += 4)
	284	{
	285	#ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS
	286	T_update::update(data[i], expr.fastRead(i));
	287	T_update::update(data[i+1], expr.fastRead(i+1));
	288	T_update::update(data[i+2], expr.fastRead(i+2));
	289	T_update::update(data[i+3], expr.fastRead(i+3));
	290	#else
	291	int t1 = i+1;
	292	int t2 = i+2;
	293	int t3 = i+3;
	294
	295	_bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4;
	296
	297	tmp1 = expr.fastRead(i);
	298	tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1));
	299	tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2));
	300	tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3));
	301
	302	T_update::update(data[i], BZ_NO_PROPAGATE(tmp1));
	303	T_update::update(data[BZ_NO_PROPAGATE(t1)], tmp2);
	304	T_update::update(data[BZ_NO_PROPAGATE(t2)], tmp3);
	305	T_update::update(data[BZ_NO_PROPAGATE(t3)], tmp4);
	306	#endif
	307	}
	308	#endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
	309
	310	}
	311	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	312	else {
	313
	314	#ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
	315	for (int i=0; i < ubound; i += commonStride)
	316	T_update::update(data[i], expr.fastRead(i));
	317	#else
	318	int n1 = (length(firstRank) & 3) * commonStride;
	319
	320	int i = 0;
	321	for (; i < n1; i += commonStride)
	322	T_update::update(data[i], expr.fastRead(i));
	323
	324	int strideInc = 4 * commonStride;
	325	for (; i < ubound; i += strideInc)
	326	{
	327	T_update::update(data[i], expr.fastRead(i));
	328	int i2 = i + commonStride;
	329	T_update::update(data[i2], expr.fastRead(i2));
	330	int i3 = i + 2 * commonStride;
	331	T_update::update(data[i3], expr.fastRead(i3));
	332	int i4 = i + 3 * commonStride;
	333	T_update::update(data[i4], expr.fastRead(i4));
	334	}
	335	#endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
	336	}
	337	#endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	338
	339	#else // ! BZ_USE_FAST_READ_ARRAY_EXPR
	340
	341	#ifdef BZ_DEBUG_TRAVERSE
	342	BZ_DEBUG_MESSAGE("Common stride, no fast read");
	343	#endif
	344	while (iter.data() != last)
	345	{
	346	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
	347	iter.advance(commonStride);
	348	expr.advance(commonStride);
	349	}
	350	#endif
	351	}
	352	else {
	353	while (iter.data() != last)
	354	{
	355	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
	356	iter.advance();
	357	expr.advance();
	358	}
	359	}
	360
	361	return *this;
	362	}
	363
	364	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
	365	inline Array<T_numtype, N_rank>&
	366	Array<T_numtype, N_rank>::evaluateWithStackTraversalN(_bz_ArrayExpr<T_expr>
	367	expr, T_update)
	368	{
	369	/*
	370	* A stack traversal replaces the usual nested loops:
	371	*
	372	* for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
	373	* for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
	374	* for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
	375	* A(i,j,k) = 0;
	376	*
	377	* with a stack data structure. The stack allows this single
	378	* routine to replace any number of nested loops.
	379	*
	380	* For each dimension (loop), these quantities are needed:
	381	* - a pointer to the first element encountered in the loop
	382	* - the stride associated with the dimension/loop
	383	* - a pointer to the last element encountered in the loop
	384	*
	385	* The basic idea is that entering each loop is a "push" onto the
	386	* stack, and exiting each loop is a "pop". In practice, this
	387	* routine treats accesses the stack in a random-access way,
	388	* which confuses the picture a bit. But conceptually, that's
	389	* what is going on.
	390	*/
	391
	392	/*
	393	* ordering(0) gives the dimension associated with the smallest
	394	* stride (usually; the exceptions have to do with subarrays and
	395	* are uninteresting). We call this dimension maxRank; it will
	396	* become the innermost "loop".
	397	*
	398	* Ordering the loops from ordering(N_rank-1) down to
	399	* ordering(0) ensures that the largest stride is associated
	400	* with the outermost loop, and the smallest stride with the
	401	* innermost. This is critical for good performance on
	402	* cached machines.
	403	*/
	404	const int maxRank = ordering(0);
	405	const int secondLastRank = ordering(1);
	406
	407	// Create an iterator for the array receiving the result
	408	ArrayIterator<T_numtype, N_rank> iter(*this);
	409
	410	// Set the initial stack configuration by pushing the pointer
	411	// to the first element of the array onto the stack N times.
	412
	413	int i;
	414	for (i=1; i < N_rank; ++i)
	415	{
	416	iter.push(i);
	417	expr.push(i);
	418	}
	419
	420	// Load the strides associated with the innermost loop.
	421	iter.loadStride(maxRank);
	422	expr.loadStride(maxRank);
	423
	424	/*
	425	* Is the stride in the innermost loop equal to 1? If so,
	426	* we might take advantage of this and generate more
	427	* efficient code.
	428	*/
	429	_bz_bool useUnitStride = iter.isUnitStride(maxRank)
	430	&& expr.isUnitStride(maxRank);
	431
	432	/*
	433	* Do all array operands share a common stride in the innermost
	434	* loop? If so, we can generate more efficient code (but only
	435	* if this optimization has been enabled).
	436	*/
	437	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	438	int commonStride = expr.suggestStride(maxRank);
	439	if (iter.suggestStride(maxRank) > commonStride)
	440	commonStride = iter.suggestStride(maxRank);
	441	bool useCommonStride = iter.isStride(maxRank,commonStride)
	442	&& expr.isStride(maxRank,commonStride);
	443
	444	#ifdef BZ_DEBUG_TRAVERSE
	445	BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
	446	<< "commonStride = " << commonStride << " useCommonStride = "
	447	<< useCommonStride);
	448	#endif
	449
	450	#else
	451	int commonStride = 1;
	452	bool useCommonStride = _bz_false;
	453	#endif
	454
	455	/*
	456	* The "last" array contains a pointer to the last element
	457	* encountered in each "loop".
	458	*/
	459	const T_numtype* _bz_restrict last[N_rank];
	460
	461	// Set up the initial state of the "last" array
	462	for (i=1; i < N_rank; ++i)
	463	last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));
	464
	465	int lastLength = length(maxRank);
	466	int firstNoncollapsedLoop = 1;
	467
	468	#ifdef BZ_COLLAPSE_LOOPS
	469
	470	/*
	471	* This bit of code handles collapsing loops. When possible,
	472	* the N nested loops are converted into a single loop (basically,
	473	* the N-dimensional array is treated as a long vector).
	474	* This is important for cases where the length of the innermost
	475	* loop is very small, for example a 100x100x3 array.
	476	* If this code can't collapse all the loops into a single loop,
	477	* it will collapse as many loops as possible starting from the
	478	* innermost and working out.
	479	*/
	480
	481	// Collapse loops when possible
	482	for (i=1; i < N_rank; ++i)
	483	{
	484	// Figure out which pair of loops we are considering combining.
	485	int outerLoopRank = ordering(i);
	486	int innerLoopRank = ordering(i-1);
	487
	488	/*
	489	* The canCollapse() routines look at the strides and extents
	490	* of the loops, and determine if they can be combined into
	491	* one loop.
	492	*/
	493
	494	if (canCollapse(outerLoopRank,innerLoopRank)
	495	&& expr.canCollapse(outerLoopRank,innerLoopRank))
	496	{
	497	lastLength *= length(outerLoopRank);
	498	firstNoncollapsedLoop = i+1;
	499	}
	500	}
	501	#endif // BZ_COLLAPSE_LOOPS
	502
	503	/*
	504	* Now we actually perform the loops. This while loop contains
	505	* two parts: first, the innermost loop is performed. Then we
	506	* exit the loop, and pop our way down the stack until we find
	507	* a loop that isn't completed. We then restart the inner loops
	508	* and push them onto the stack.
	509	*/
	510
	511	while (true) {
	512
	513	/*
	514	* This bit of code handles the innermost loop. It would look
	515	* a lot simpler if it weren't for unit stride and common stride
	516	* optimizations; these clutter up the code with multiple versions.
	517	*/
	518
	519	if ((useUnitStride) \|\| (useCommonStride))
	520	{
	521	T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
	522	+ lastLength;
	523
	524	#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
	525
	526	/*
	527	* The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably
	528	* be taken out. This was put in place while the unit stride/
	529	* common stride optimizations were being implemented and
	530	* tested.
	531	*/
	532
	533	// Calculate the end of the innermost loop
	534	int ubound = lastLength * commonStride;
	535
	536	/*
	537	* This is a real kludge. I didn't want to have to write
	538	* a const and non-const version of ArrayIterator, so I use a
	539	* const iterator and cast away const. This could
	540	* probably be avoided with some trick, but the whole routine
	541	* is ugly, so why bother.
	542	*/
	543
	544	T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
	545
	546	/*
	547	* BZ_NEEDS_WORK-- need to implement optional unrolling.
	548	*/
	549	if (commonStride == 1)
	550	{
	551	for (int i=0; i < ubound; ++i)
	552	T_update::update(data[i], expr.fastRead(i));
	553	}
	554	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	555	else {
	556	for (int i=0; i < ubound; i += commonStride)
	557	T_update::update(data[i], expr.fastRead(i));
	558	}
	559	#endif
	560	/*
	561	* Tidy up for the fact that we haven't actually been
	562	* incrementing the iterators in the innermost loop, by
	563	* faking it afterward.
	564	*/
	565	iter.advance(lastLength * commonStride);
	566	expr.advance(lastLength * commonStride);
	567	#else
	568	// !BZ_USE_FAST_READ_ARRAY_EXPR
	569	// This bit of code not really needed; should remove at some
	570	// point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR
	571
	572	while (iter.data() != end)
	573	{
	574	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
	575	iter.advance(commonStride);
	576	expr.advance(commonStride);
	577	}
	578	#endif
	579	}
	580	else {
	581	/*
	582	* We don't have a unit stride or common stride in the innermost
	583	* loop. This is going to hurt performance. Luckily 95% of
	584	* the time, we hit the cases above.
	585	*/
	586	T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
	587	+ lastLength * stride(maxRank);
	588
	589	while (iter.data() != end)
	590	{
	591	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
	592	iter.advance();
	593	expr.advance();
	594	}
	595	}
	596
	597
	598	/*
	599	* We just finished the innermost loop. Now we pop our way down
	600	* the stack, until we hit a loop that hasn't completed yet.
	601	*/
	602	int j = firstNoncollapsedLoop;
	603	for (; j < N_rank; ++j)
	604	{
	605	// Get the next loop
	606	int r = ordering(j);
	607
	608	// Pop-- this restores the data pointers to the first element
	609	// encountered in the loop.
	610	iter.pop(j);
	611	expr.pop(j);
	612
	613	// Load the stride associated with this loop, and increment
	614	// once.
	615	iter.loadStride(r);
	616	expr.loadStride(r);
	617	iter.advance();
	618	expr.advance();
	619
	620	// If we aren't at the end of this loop, then stop popping.
	621	if (iter.data() != last[j])
	622	break;
	623	}
	624
	625	// Are we completely done?
	626	if (j == N_rank)
	627	break;
	628
	629	// No, so push all the inner loops back onto the stack.
	630	for (; j >= firstNoncollapsedLoop; --j)
	631	{
	632	int r2 = ordering(j-1);
	633	iter.push(j);
	634	expr.push(j);
	635	last[j-1] = iter.data() + length(r2) * stride(r2);
	636	}
	637
	638	// Load the stride for the innermost loop again.
	639	iter.loadStride(maxRank);
	640	expr.loadStride(maxRank);
	641	}
	642
	643	return *this;
	644	}
	645
	646	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
	647	inline Array<T_numtype, N_rank>&
	648	Array<T_numtype, N_rank>::evaluateWithIndexTraversal1(_bz_ArrayExpr<T_expr>
	649	expr, T_update)
	650	{
	651	TinyVector<int,N_rank> index;
	652
	653	if (stride(firstRank) == 1)
	654	{
	655	T_numtype * _bz_restrict iter = data_;
	656	int last = ubound(firstRank);
	657
	658	for (index[0] = lbound(firstRank); index[0] <= last;
	659	++index[0])
	660	{
	661	iter[index[0]] = expr(index);
	662	}
	663	}
	664	else {
	665	ArrayIterator<T_numtype, N_rank> iter(*this);
	666	iter.loadStride(0);
	667	int last = ubound(firstRank);
	668
	669	for (index[0] = lbound(firstRank); index[0] <= last;
	670	++index[0])
	671	{
	672	T_update::update(const_cast<T_numtype>(iter.data()),
	673	expr(index));
	674	iter.advance();
	675	}
	676	}
	677
	678	return *this;
	679	}
	680
	681	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
	682	inline Array<T_numtype, N_rank>&
	683	Array<T_numtype, N_rank>::evaluateWithIndexTraversalN(_bz_ArrayExpr<T_expr>
	684	expr, T_update)
	685	{
	686	// Do a stack-type traversal for the destination array and use
	687	// index traversal for the source expression
	688
	689	const int maxRank = ordering(0);
	690	const int secondLastRank = ordering(1);
	691
	692	#ifdef BZ_DEBUG_TRAVERSE
	693	cout << "Index traversal: N_rank = " << N_rank << endl;
	694	cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
	695	<< endl;
	696	cout.flush();
	697	#endif
	698
	699	ArrayIterator<T_numtype, N_rank> iter(*this);
	700	for (int i=1; i < N_rank; ++i)
	701	iter.push(ordering(i));
	702
	703	iter.loadStride(maxRank);
	704
	705	TinyVector<int,N_rank> index, last;
	706
	707	index = storage_.base();
	708	last = storage_.base() + length_;
	709
	710	int lastLength = length(maxRank);
	711
	712	while (true) {
	713
	714	for (index[maxRank] = base(maxRank);
	715	index[maxRank] < last[maxRank];
	716	++index[maxRank])
	717	{
	718	#ifdef BZ_DEBUG_TRAVERSE
	719	#if 0
	720	cout << "(" << index[0] << "," << index[1] << ") " << endl;
	721	cout.flush();
	722	#endif
	723	#endif
	724
	725	T_update::update(const_cast<T_numtype>(iter.data()), expr(index));
	726	iter.advance();
	727	}
	728
	729	int j = 1;
	730	for (; j < N_rank; ++j)
	731	{
	732	iter.pop(ordering(j));
	733	iter.loadStride(ordering(j));
	734	iter.advance();
	735
	736	index[ordering(j-1)] = base(ordering(j-1));
	737	++index[ordering(j)];
	738	if (index[ordering(j)] != last[ordering(j)])
	739	break;
	740	}
	741
	742	if (j == N_rank)
	743	break;
	744
	745	for (; j > 0; --j)
	746	{
	747	iter.push(ordering(j));
	748	}
	749	iter.loadStride(maxRank);
	750	}
	751
	752	return *this;
	753	}
	754
	755	// Fast traversals require <set> from the ISO/ANSI C++ standard library
	756
	757	#ifdef BZ_HAVE_STD
	758
	759	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
	760	inline Array<T_numtype, N_rank>&
	761	Array<T_numtype, N_rank>::evaluateWithFastTraversal(
	762	const TraversalOrder<N_rank - 1>& order, _bz_ArrayExpr<T_expr> expr,
	763	T_update)
	764	{
	765	const int maxRank = ordering(0);
	766	const int secondLastRank = ordering(1);
	767
	768	#ifdef BZ_DEBUG_TRAVERSE
	769	cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
	770	<< endl;
	771	#endif
	772
	773	ArrayIterator<T_numtype, N_rank> iter(*this);
	774	iter.push(0);
	775	expr.push(0);
	776
	777	_bz_bool useUnitStride = iter.isUnitStride(maxRank)
	778	&& expr.isUnitStride(maxRank);
	779
	780	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	781	int commonStride = expr.suggestStride(maxRank);
	782	if (iter.suggestStride(maxRank) > commonStride)
	783	commonStride = iter.suggestStride(maxRank);
	784	bool useCommonStride = iter.isStride(maxRank,commonStride)
	785	&& expr.isStride(maxRank,commonStride);
	786	#else
	787	int commonStride = 1;
	788	bool useCommonStride = _bz_false;
	789	#endif
	790
	791	int lastLength = length(maxRank);
	792
	793	for (int i=0; i < order.length(); ++i)
	794	{
	795	iter.pop(0);
	796	expr.pop(0);
	797
	798	#ifdef BZ_DEBUG_TRAVERSE
	799	cerr << "Traversing: " << order[i] << endl;
	800	#endif
	801	// Position the iterator at the start of the next column
	802	for (int j=1; j < N_rank; ++j)
	803	{
	804	iter.loadStride(ordering(j));
	805	expr.loadStride(ordering(j));
	806
	807	int offset = order[i][j-1];
	808	iter.advance(offset);
	809	expr.advance(offset);
	810	}
	811
	812	iter.loadStride(maxRank);
	813	expr.loadStride(maxRank);
	814
	815	// Evaluate the expression along the column
	816
	817	if ((useUnitStride) \|\| (useCommonStride))
	818	{
	819	T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
	820	+ lastLength * commonStride;
	821
	822	#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
	823	int ubound = lastLength * commonStride;
	824	T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
	825
	826	if (commonStride == 1)
	827	{
	828	#ifndef BZ_ARRAY_FAST_TRAVERSAL_UNROLL
	829	for (int i=0; i < ubound; ++i)
	830	T_update::update(data[i], expr.fastRead(i));
	831	#else
	832	int n1 = ubound & 3;
	833	int i=0;
	834	for (; i < n1; ++i)
	835	T_update::update(data[i], expr.fastRead(i));
	836
	837	for (; i < ubound; i += 4)
	838	{
	839	T_update::update(data[i], expr.fastRead(i));
	840	T_update::update(data[i+1], expr.fastRead(i+1));
	841	T_update::update(data[i+2], expr.fastRead(i+2));
	842	T_update::update(data[i+3], expr.fastRead(i+3));
	843	}
	844	#endif // BZ_ARRAY_FAST_TRAVERSAL_UNROLL
	845	}
	846	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	847	else {
	848	for (int i=0; i < ubound; i += commonStride)
	849	T_update::update(data[i], expr.fastRead(i));
	850	}
	851	#endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	852
	853	iter.advance(lastLength * commonStride);
	854	expr.advance(lastLength * commonStride);
	855	#else // ! BZ_USE_FAST_READ_ARRAY_EXPR
	856	while (iter.data() != last)
	857	{
	858	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
	859	iter.advance(commonStride);
	860	expr.advance(commonStride);
	861	}
	862	#endif // BZ_USE_FAST_READ_ARRAY_EXPR
	863
	864	}
	865	else {
	866	// No common stride
	867
	868	T_numtype* _bz_restrict last = const_cast<T_numtype*>(iter.data())
	869	+ lastLength * stride(maxRank);
	870
	871	while (iter.data() != last)
	872	{
	873	T_update::update(const_cast<T_numtype>(iter.data()), *expr);
	874	iter.advance();
	875	expr.advance();
	876	}
	877	}
	878	}
	879
	880	return *this;
	881	}
	882	#endif // BZ_HAVE_STD
	883
	884	#ifdef BZ_ARRAY_2D_NEW_STENCIL_TILING
	885
	886	#ifdef BZ_ARRAY_2D_STENCIL_TILING
	887
	888	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
	889	inline Array<T_numtype, N_rank>&
	890	Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
	891	expr, T_update)
	892	{
	893	const int minorRank = ordering(0);
	894	const int majorRank = ordering(1);
	895
	896	ArrayIterator<T_numtype, N_rank> iter(*this);
	897	iter.push(0);
	898	expr.push(0);
	899
	900	#ifdef BZ_2D_STENCIL_DEBUG
	901	int count = 0;
	902	#endif
	903
	904	_bz_bool useUnitStride = iter.isUnitStride(minorRank)
	905	&& expr.isUnitStride(minorRank);
	906
	907	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	908	int commonStride = expr.suggestStride(minorRank);
	909	if (iter.suggestStride(minorRank) > commonStride)
	910	commonStride = iter.suggestStride(minorRank);
	911	bool useCommonStride = iter.isStride(minorRank,commonStride)
	912	&& expr.isStride(minorRank,commonStride);
	913	#else
	914	int commonStride = 1;
	915	bool useCommonStride = _bz_false;
	916	#endif
	917
	918	// Determine if a common major stride exists
	919	int commonMajorStride = expr.suggestStride(majorRank);
	920	if (iter.suggestStride(majorRank) > commonMajorStride)
	921	commonMajorStride = iter.suggestStride(majorRank);
	922	bool haveCommonMajorStride = iter.isStride(majorRank,commonMajorStride)
	923	&& expr.isStride(majorRank,commonMajorStride);
	924
	925
	926	int maxi = length(majorRank);
	927	int maxj = length(minorRank);
	928
	929	const int tileHeight = 16, tileWidth = 3;
	930
	931	int bi, bj;
	932	for (bi=0; bi < maxi; bi += tileHeight)
	933	{
	934	int ni = bi + tileHeight;
	935	if (ni > maxi)
	936	ni = maxi;
	937
	938	// Move back to the beginning of the array
	939	iter.pop(0);
	940	expr.pop(0);
	941
	942	// Move to the start of this tile row
	943	iter.loadStride(majorRank);
	944	iter.advance(bi);
	945	expr.loadStride(majorRank);
	946	expr.advance(bi);
	947
	948	// Save this position
	949	iter.push(1);
	950	expr.push(1);
	951
	952	for (bj=0; bj < maxj; bj += tileWidth)
	953	{
	954	// Move to the beginning of the tile row
	955	iter.pop(1);
	956	expr.pop(1);
	957
	958	// Move to the top of the current tile (bi,bj)
	959	iter.loadStride(minorRank);
	960	iter.advance(bj);
	961	expr.loadStride(minorRank);
	962	expr.advance(bj);
	963
	964	if (bj + tileWidth <= maxj)
	965	{
	966	// Strip mining
	967
	968	if ((useUnitStride) && (haveCommonMajorStride))
	969	{
	970	int offset = 0;
	971	T_numtype* _bz_restrict data = const_cast<T_numtype*>
	972	(iter.data());
	973
	974	for (int i=bi; i < ni; ++i)
	975	{
	976	_bz_typename T_expr::T_numtype tmp1, tmp2, tmp3;
	977
	978	// Common subexpression elimination -- compilers
	979	// won't necessarily do this on their own.
	980	int t1 = offset+1;
	981	int t2 = offset+2;
	982
	983	tmp1 = expr.fastRead(offset);
	984	tmp2 = expr.fastRead(t1);
	985	tmp3 = expr.fastRead(t2);
	986
	987	T_update::update(data[0], tmp1);
	988	T_update::update(data[1], tmp2);
	989	T_update::update(data[2], tmp3);
	990
	991	offset += commonMajorStride;
	992	data += commonMajorStride;
	993
	994	#ifdef BZ_2D_STENCIL_DEBUG
	995	count += 3;
	996	#endif
	997	}
	998	}
	999	else {
	1000
	1001	for (int i=bi; i < ni; ++i)
	1002	{
	1003	iter.loadStride(minorRank);
	1004	expr.loadStride(minorRank);
	1005
	1006	// Loop through current row elements
	1007	T_update::update(const_cast<T_numtype>(iter.data()),
	1008	*expr);
	1009	iter.advance();
	1010	expr.advance();
	1011
	1012	T_update::update(const_cast<T_numtype>(iter.data()),
	1013	*expr);
	1014	iter.advance();
	1015	expr.advance();
	1016
	1017	T_update::update(const_cast<T_numtype>(iter.data()),
	1018	*expr);
	1019	iter.advance(-2);
	1020	expr.advance(-2);
	1021
	1022	iter.loadStride(majorRank);
	1023	expr.loadStride(majorRank);
	1024	iter.advance();
	1025	expr.advance();
	1026
	1027	#ifdef BZ_2D_STENCIL_DEBUG
	1028	count += 3;
	1029	#endif
	1030
	1031	}
	1032	}
	1033	}
	1034	else {
	1035
	1036	// This code handles partial tiles at the bottom of the
	1037	// array.
	1038
	1039	for (int j=bj; j < maxj; ++j)
	1040	{
	1041	iter.loadStride(majorRank);
	1042	expr.loadStride(majorRank);
	1043
	1044	for (int i=bi; i < ni; ++i)
	1045	{
	1046	T_update::update(const_cast<T_numtype>(iter.data()),
	1047	*expr);
	1048	iter.advance();
	1049	expr.advance();
	1050	#ifdef BZ_2D_STENCIL_DEBUG
	1051	++count;
	1052	#endif
	1053
	1054	}
	1055
	1056	// Move back to the top of this column
	1057	iter.advance(bi-ni);
	1058	expr.advance(bi-ni);
	1059
	1060	// Move over to the next column
	1061	iter.loadStride(minorRank);
	1062	expr.loadStride(minorRank);
	1063
	1064	iter.advance();
	1065	expr.advance();
	1066	}
	1067	}
	1068	}
	1069	}
	1070
	1071	#ifdef BZ_2D_STENCIL_DEBUG
	1072	cout << "BZ_2D_STENCIL_DEBUG: count = " << count << endl;
	1073	#endif
	1074
	1075	return *this;
	1076	}
	1077
	1078	#endif // BZ_ARRAY_2D_STENCIL_TILING
	1079	#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
	1080
	1081
	1082
	1083	#ifndef BZ_ARRAY_2D_NEW_STENCIL_TILING
	1084
	1085	#ifdef BZ_ARRAY_2D_STENCIL_TILING
	1086
	1087	template<class T_numtype, int N_rank> template<class T_expr, class T_update>
	1088	inline Array<T_numtype, N_rank>&
	1089	Array<T_numtype, N_rank>::evaluateWithTiled2DTraversal(_bz_ArrayExpr<T_expr>
	1090	expr, T_update)
	1091	{
	1092	const int minorRank = ordering(0);
	1093	const int majorRank = ordering(1);
	1094
	1095	const int blockSize = 16;
	1096
	1097	ArrayIterator<T_numtype, N_rank> iter(*this);
	1098	iter.push(0);
	1099	expr.push(0);
	1100
	1101	_bz_bool useUnitStride = iter.isUnitStride(minorRank)
	1102	&& expr.isUnitStride(minorRank);
	1103
	1104	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	1105	int commonStride = expr.suggestStride(minorRank);
	1106	if (iter.suggestStride(minorRank) > commonStride)
	1107	commonStride = iter.suggestStride(minorRank);
	1108	bool useCommonStride = iter.isStride(minorRank,commonStride)
	1109	&& expr.isStride(minorRank,commonStride);
	1110	#else
	1111	int commonStride = 1;
	1112	bool useCommonStride = _bz_false;
	1113	#endif
	1114
	1115	int maxi = length(majorRank);
	1116	int maxj = length(minorRank);
	1117
	1118	int bi, bj;
	1119	for (bi=0; bi < maxi; bi += blockSize)
	1120	{
	1121	int ni = bi + blockSize;
	1122	if (ni > maxi)
	1123	ni = maxi;
	1124
	1125	for (bj=0; bj < maxj; bj += blockSize)
	1126	{
	1127	int nj = bj + blockSize;
	1128	if (nj > maxj)
	1129	nj = maxj;
	1130
	1131	// Move to the beginning of the array
	1132	iter.pop(0);
	1133	expr.pop(0);
	1134
	1135	// Move to the beginning of the tile (bi,bj)
	1136	iter.loadStride(majorRank);
	1137	iter.advance(bi);
	1138	iter.loadStride(minorRank);
	1139	iter.advance(bj);
	1140
	1141	expr.loadStride(majorRank);
	1142	expr.advance(bi);
	1143	expr.loadStride(minorRank);
	1144	expr.advance(bj);
	1145
	1146	// Loop through tile rows
	1147	for (int i=bi; i < ni; ++i)
	1148	{
	1149	// Save the beginning of this tile row
	1150	iter.push(1);
	1151	expr.push(1);
	1152
	1153	// Load the minor stride
	1154	iter.loadStride(minorRank);
	1155	expr.loadStride(minorRank);
	1156
	1157	if (useUnitStride)
	1158	{
	1159	T_numtype* _bz_restrict data = const_cast<T_numtype*>
	1160	(iter.data());
	1161
	1162	int ubound = (nj-bj);
	1163	for (int j=0; j < ubound; ++j)
	1164	T_update::update(data[j], expr.fastRead(j));
	1165	}
	1166	#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
	1167	else if (useCommonStride)
	1168	{
	1169	int ubound = (nj-bj) * commonStride;
	1170	T_numtype* _bz_restrict data = const_cast<T_numtype*>
	1171	(iter.data());
	1172
	1173	for (int j=0; j < ubound; j += commonStride)
	1174	T_update::update(data[j], expr.fastRead(j));
	1175	}
	1176	#endif
	1177	else {
	1178	for (int j=bj; j < nj; ++j)
	1179	{
	1180	// Loop through current row elements
	1181	T_update::update(const_cast<T_numtype>(iter.data()),
	1182	*expr);
	1183	iter.advance();
	1184	expr.advance();
	1185	}
	1186	}
	1187
	1188	// Move back to the beginning of the tile row, then
	1189	// move to the next row
	1190	iter.pop(1);
	1191	iter.loadStride(majorRank);
	1192	iter.advance(1);
	1193
	1194	expr.pop(1);
	1195	expr.loadStride(majorRank);
	1196	expr.advance(1);
	1197	}
	1198	}
	1199	}
	1200
	1201	return *this;
	1202	}
	1203	#endif // BZ_ARRAY_2D_STENCIL_TILING
	1204	#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
	1205
	1206	BZ_NAMESPACE_END
	1207
	1208	#endif // BZ_ARRAYEVAL_CC
	1209

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: Sophya/trunk/Poubelle/DPC:FitsIOServer/Blitz/blitz/array/eval.cc

Download in other formats: