Figure 1: What possible compiler optimizations might look like if done by hand
#define N 1000

/*
 * Kernel with hoisting of invariant done by hand.
 *
 * Stores (b[j+N/4] + b[j-N/4]) * 0.5f into a[0] .. a[n-1].
 *
 *   a  destination; at least n elements are written when n > 0
 *   b  source; only b[j+N/4] and b[j-N/4] are read
 *   n  number of elements of a to fill; n <= 0 stores nothing
 *   j  centre index into b; the caller must keep j-N/4 and j+N/4
 *      inside b
 *
 * The loads are guarded by the trip-count test: a loop that evaluates
 * the expression in its body never touches b when n <= 0, so the hoisted
 * load must not do so either (it could fault on an invalid b or j).
 */
void with_hoisting( float * restrict a, float * restrict b, int n, int j )
{
    float t0;
    int i;

    if( n <= 0 ) {
        /* Zero-trip loop: nothing to store, and b must not be read. */
        return;
    }

    t0 = (b[j+N/4] + b[j-N/4]) * 0.5f;
    for( i=0; i<n; ++i ) {
        a[i] = t0;
    }
}

/*
 * Kernel with software pipelining done by hand.
 * The optimal pipelining depends upon the target machine.
 * The example here is only one such way.
 * It's mighty peculiar to do it in this example
 * since b[..] can be hoisted,
 * but nonetheless at least one compiler did something
 * similar to this.
 *
 * Stores (b[j+N/4] + b[j-N/4]) * 0.5f into a[0] .. a[n-1], with the
 * same parameters as with_hoisting().  Each logical iteration is
 * split into three stages (load, add, multiply-and-store) that overlap
 * across iterations, so the pipelined path needs at least three
 * iterations; shorter trip counts use a plain loop.
 */
void with_software_pipelining( float * restrict a, float * restrict b, int n, int j )
{
    int i;
    float t0, t1, t2, t3;

    if( 3 <= n ) {
        /* prologue for pipelined loop */
        t0 = b[j+(N/4)];    /* Part of iteration i=0 */
        t1 = b[j-(N/4)];    /*  "   "     "      i=0 */

        t2 = t0 + t1;       /*  "   "     "      i=0 */
        t0 = b[j+(N/4)];    /*  "   "     "      i=1 */
        t1 = b[j-(N/4)];    /*  "   "     "      i=1 */

        t3 = 0.5f * t2;     /* Part of iteration i=0 */
        t2 = t0 + t1;       /*  "   "     "      i=1 */
        t0 = b[j+(N/4)];    /*  "   "     "      i=2 */
        t1 = b[j-(N/4)];    /*  "   "     "      i=2 */

        /* The pipelined loop */
        for( i=3; i<n; ++i ) {
            /* Next five statements could be evaluated in single step. */
            a[i-3] = t3;        /* finish iteration i-3 */
            t3 = 0.5f * t2;     /* multiply for iteration i-2 */
            t2 = t0 + t1;       /* add for iteration i-1 */
            t0 = b[j+(N/4)];    /* loads for iteration i */
            t1 = b[j-(N/4)];
        }

        /* epilogue for pipelined loop: drain the three iterations in flight */
        a[n-3] = t3;        /* Part of iteration i=n-3 */
        t3 = 0.5f * t2;     /*  "   "     "      i=n-2 */
        t2 = t0 + t1;       /*  "   "     "      i=n-1 */
        a[n-2] = t3;        /*  "   "     "      i=n-2 */
        t3 = 0.5f * t2;     /*  "   "     "      i=n-1 */
        a[n-1] = t3;        /*  "   "     "      i=n-1 */
    } else {
        /* Not enough iterations to pipeline the loop */
        for( i=0; i<n; ++i ) {
            a[i] = (b[j+N/4] + b[j-N/4]) * 0.5f;
        }
    }
}