From 64c2461add37e871ea79454fa94c48858f16016c Mon Sep 17 00:00:00 2001 From: "lisa.pizzo" Date: Wed, 29 Oct 2025 23:01:12 +0100 Subject: [PATCH] Upload files to "Sheet3" --- Sheet3/flops.c | 1197 +++++++++++++++++++++++++++++++++++++++++++++++ Sheet3/stream.c | 585 +++++++++++++++++++++++ 2 files changed, 1782 insertions(+) create mode 100644 Sheet3/flops.c create mode 100644 Sheet3/stream.c diff --git a/Sheet3/flops.c b/Sheet3/flops.c new file mode 100644 index 0000000..c7aa870 --- /dev/null +++ b/Sheet3/flops.c @@ -0,0 +1,1197 @@ +/* gcc -O3 -DUNIX -funroll-all-loops -o flops flops.c */ +/* ./flops */ + +/*--------------------- Start flops.c source code ----------------------*/ + +/*****************************/ +/* flops.c */ +/* Version 2.0, 18 Dec 1992 */ +/* Al Aburto */ +/* aburto@nosc.mil */ +/*****************************/ + +/* + Flops.c is a 'c' program which attempts to estimate your systems + floating-point 'MFLOPS' rating for the FADD, FSUB, FMUL, and FDIV + operations based on specific 'instruction mixes' (discussed below). + The program provides an estimate of PEAK MFLOPS performance by making + maximal use of register variables with minimal interaction with main + memory. The execution loops are all small so that they will fit in + any cache. Flops.c can be used along with Linpack and the Livermore + kernels (which exersize memory much more extensively) to gain further + insight into the limits of system performance. The flops.c execution + modules also include various percent weightings of FDIV's (from 0% to + 25% FDIV's) so that the range of performance can be obtained when + using FDIV's. FDIV's, being computationally more intensive than + FADD's or FMUL's, can impact performance considerably on some systems. + + Flops.c consists of 8 independent modules (routines) which, except for + module 2, conduct numerical integration of various functions. Module + 2, estimates the value of pi based upon the Maclaurin series expansion + of atan(1). MFLOPS ratings are provided for each module, but the + programs overall results are summerized by the MFLOPS(1), MFLOPS(2), + MFLOPS(3), and MFLOPS(4) outputs. + + The MFLOPS(1) result is identical to the result provided by all + previous versions of flops.c. It is based only upon the results from + modules 2 and 3. Two problems surfaced in using MFLOPS(1). First, it + was difficult to completely 'vectorize' the result due to the + recurrence of the 's' variable in module 2. This problem is addressed + in the MFLOPS(2) result which does not use module 2, but maintains + nearly the same weighting of FDIV's (9.2%) as in MFLOPS(1) (9.6%). + The second problem with MFLOPS(1) centers around the percentage of + FDIV's (9.6%) which was viewed as too high for an important class of + problems. This concern is addressed in the MFLOPS(3) result where NO + FDIV's are conducted at all. + + The number of floating-point instructions per iteration (loop) is + given below for each module executed: + + MODULE FADD FSUB FMUL FDIV TOTAL Comment + 1 7 0 6 1 14 7.1% FDIV's + 2 3 2 1 1 7 difficult to vectorize. + 3 6 2 9 0 17 0.0% FDIV's + 4 7 0 8 0 15 0.0% FDIV's + 5 13 0 15 1 29 3.4% FDIV's + 6 13 0 16 0 29 0.0% FDIV's + 7 3 3 3 3 12 25.0% FDIV's + 8 13 0 17 0 30 0.0% FDIV's + + A*2+3 21 12 14 5 52 A=5, MFLOPS(1), Same as + 40.4% 23.1% 26.9% 9.6% previous versions of the + flops.c program. Includes + only Modules 2 and 3, does + 9.6% FDIV's, and is not + easily vectorizable. + + 1+3+4 58 14 66 14 152 A=4, MFLOPS(2), New output + +5+6+ 38.2% 9.2% 43.4% 9.2% does not include Module 2, + A*7 but does 9.2% FDIV's. + + 1+3+4 62 5 74 5 146 A=0, MFLOPS(3), New output + +5+6+ 42.9% 3.4% 50.7% 3.4% does not include Module 2, + 7+8 but does 3.4% FDIV's. + + 3+4+6 39 2 50 0 91 A=0, MFLOPS(4), New output + +8 42.9% 2.2% 54.9% 0.0% does not include Module 2, + and does NO FDIV's. + + NOTE: Various timer routines are included as indicated below. The + timer routines, with some comments, are attached at the end + of the main program. + + NOTE: Please do not remove any of the printouts. + + EXAMPLE COMPILATION: + UNIX based systems + cc -DUNIX -O flops.c -o flops + cc -DUNIX -DROPT flops.c -o flops + cc -DUNIX -fast -O4 flops.c -o flops + . + . + . + etc. + + Al Aburto + aburto@nosc.mil +*/ + +/***************************************************************/ +/* Timer options. You MUST uncomment one of the options below */ +/* or compile, for example, with the '-DUNIX' option. */ +/***************************************************************/ +/* #define Amiga */ +/* #define UNIX */ +/* #define UNIX_Old */ +/* #define VMS */ +/* #define BORLAND_C */ +/* #define MSC */ +/* #define MAC */ +/* #define IPSC */ +/* #define FORTRAN_SEC */ +/* #define GTODay */ +/* #define CTimer */ +/* #define UXPM */ +/* #define MAC_TMgr */ +/* #define PARIX */ +/* #define POSIX */ +/* #define WIN32 */ +/* #define POSIX1 */ +/***********************/ + +#include +#include + /* 'Uncomment' the line below to run */ + /* with 'register double' variables */ + /* defined, or compile with the */ + /* '-DROPT' option. Don't need this if */ + /* registers used automatically, but */ + /* you might want to try it anyway. */ +/* #define ROPT */ +#include +#include + +#ifdef __APPLE__ +#include +double dtime(double p[]) +{ + static double timebase = 0.0; + static uint64_t timestart = 0; + uint64_t now; + double q; + + if (timebase == 0.0) { + mach_timebase_info_data_t tb; + mach_timebase_info(&tb); + timebase = tb.numer / (1e9 * tb.denom); + timestart = mach_absolute_time(); + p[2] = 0.0; + p[1] = 0.0; + return 0.0; + } + + q = p[2]; + now = mach_absolute_time(); + p[2] = (now - timestart) * timebase; // seconds + p[1] = p[2] - q; + return 0.0; +} +#else +#include +double dtime(double p[]) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + double now = ts.tv_sec + ts.tv_nsec * 1e-9; + double q = p[2]; + p[2] = now; + p[1] = p[2] - q; + return 0.0; +} +#endif + +double nulltime, TimeArray[3]; /* Variables needed for 'dtime()'. */ +double TLimit; /* Threshold to determine Number of */ + /* Loops to run. Fixed at 15.0 seconds.*/ + +double T[36]; /* Global Array used to hold timing */ + /* results and other information. */ + +double sa,sb,sc,sd,one,two,three; +double four,five,piref,piprg; +double scale,pierr; + +double A0 = 1.0; +double A1 = -0.1666666666671334; +double A2 = 0.833333333809067E-2; +double A3 = 0.198412715551283E-3; +double A4 = 0.27557589750762E-5; +double A5 = 0.2507059876207E-7; +double A6 = 0.164105986683E-9; + +double B0 = 1.0; +double B1 = -0.4999999999982; +double B2 = 0.4166666664651E-1; +double B3 = -0.1388888805755E-2; +double B4 = 0.24801428034E-4; +double B5 = -0.2754213324E-6; +double B6 = 0.20189405E-8; + +double C0 = 1.0; +double C1 = 0.99999999668; +double C2 = 0.49999995173; +double C3 = 0.16666704243; +double C4 = 0.4166685027E-1; +double C5 = 0.832672635E-2; +double C6 = 0.140836136E-2; +double C7 = 0.17358267E-3; +double C8 = 0.3931683E-4; + +double D1 = 0.3999999946405E-1; +double D2 = 0.96E-3; +double D3 = 0.1233153E-5; + +double E2 = 0.48E-3; +double E3 = 0.411051E-6; + +// void dtime(double p[]); + +int main() +{ + +#ifdef ROPT + register double s,u,v,w,x; +#else + double s,u,v,w,x; +#endif + + long loops, NLimit; + register long i, m, n; + + printf("\n"); + printf(" FLOPS C Program (Double Precision), V2.0 18 Dec 1992\n\n"); + + /****************************/ + loops = 15625; /* Initial number of loops. */ + /* DO NOT CHANGE! */ + /****************************/ + +/****************************************************/ +/* Set Variable Values. */ +/* T[1] references all timing results relative to */ +/* one million loops. */ +/* */ +/* The program will execute from 31250 to 512000000 */ +/* loops based on a runtime of Module 1 of at least */ +/* TLimit = 15.0 seconds. That is, a runtime of 15 */ +/* seconds for Module 1 is used to determine the */ +/* number of loops to execute. */ +/* */ +/* No more than NLimit = 512000000 loops are allowed*/ +/****************************************************/ + + T[1] = 1.0E+06/(double)loops; + + TLimit = 0.5; // CHANGED! IT WAS 15.0 + NLimit = 512000000; + + piref = 3.14159265358979324; + one = 1.0; + two = 2.0; + three = 3.0; + four = 4.0; + five = 5.0; + scale = one; + + printf(" Module Error RunTime MFLOPS\n"); + printf(" (usec)\n"); +/*************************/ +/* Initialize the timer. */ +/*************************/ + + dtime(TimeArray); + dtime(TimeArray); + +/*******************************************************/ +/* Module 1. Calculate integral of df(x)/f(x) defined */ +/* below. Result is ln(f(1)). There are 14 */ +/* double precision operations per loop */ +/* ( 7 +, 0 -, 6 *, 1 / ) that are included */ +/* in the timing. */ +/* 50.0% +, 00.0% -, 42.9% *, and 07.1% / */ +/*******************************************************/ + n = loops; + sa = 0.0; + + while ( sa < TLimit ) + { + n = 2 * n; + x = one / (double)n; /*********************/ + s = 0.0; /* Loop 1. */ + v = 0.0; /*********************/ + w = one; + + dtime(TimeArray); + for( i = 1 ; i <= n-1 ; i++ ) + { + v = v + w; + u = v * x; + s = s + (D1+u*(D2+u*D3))/(w+u*(D1+u*(E2+u*E3))); + } + dtime(TimeArray); + sa = TimeArray[1]; + + if ( n == NLimit ) break; + /* printf(" %10ld %12.5lf\n",n,sa); */ + } + + scale = 1.0E+06 / (double)n; + T[1] = scale; + +/****************************************/ +/* Estimate nulltime ('for' loop time). */ +/****************************************/ + dtime(TimeArray); + for( i = 1 ; i <= n-1 ; i++ ) + { + } + dtime(TimeArray); + nulltime = T[1] * TimeArray[1]; + if ( nulltime < 0.0 ) nulltime = 0.0; + + T[2] = T[1] * sa - nulltime; + + sa = (D1+D2+D3)/(one+D1+E2+E3); + sb = D1; + + T[3] = T[2] / 14.0; /*********************/ + sa = x * ( sa + sb + two * s ) / two; /* Module 1 Results. */ + sb = one / sa; /*********************/ + n = (long)( (double)( 40000 * (long)sb ) / scale ); + sc = sb - 25.2; + T[4] = one / T[3]; + /********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /********************/ + printf(" 1 %13.4le %10.4lf %10.4lf\n",sc,T[2],T[4]); + + m = n; + +/*******************************************************/ +/* Module 2. Calculate value of PI from Taylor Series */ +/* expansion of atan(1.0). There are 7 */ +/* double precision operations per loop */ +/* ( 3 +, 2 -, 1 *, 1 / ) that are included */ +/* in the timing. */ +/* 42.9% +, 28.6% -, 14.3% *, and 14.3% / */ +/*******************************************************/ + + s = -five; /********************/ + sa = -one; /* Loop 2. */ + /********************/ + dtime(TimeArray); + for ( i = 1 ; i <= m ; i++ ) + { + s = -s; + sa = sa + s; + } + dtime(TimeArray); + T[5] = T[1] * TimeArray[1]; + if ( T[5] < 0.0 ) T[5] = 0.0; + + sc = (double)m; + + u = sa; /*********************/ + v = 0.0; /* Loop 3. */ + w = 0.0; /*********************/ + x = 0.0; + + dtime(TimeArray); + for ( i = 1 ; i <= m ; i++) + { + s = -s; + sa = sa + s; + u = u + two; + x = x +(s - u); + v = v - s * u; + w = w + s / u; + } + dtime(TimeArray); + T[6] = T[1] * TimeArray[1]; + + T[7] = ( T[6] - T[5] ) / 7.0; /*********************/ + m = (long)( sa * x / sc ); /* PI Results */ + sa = four * w / five; /*********************/ + sb = sa + five / v; + sc = 31.25; + piprg = sb - sc / (v * v * v); + pierr = piprg - piref; + T[8] = one / T[7]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 2 %13.4le %10.4lf %10.4lf\n",pierr,T[6]-T[5],T[8]); + +/*******************************************************/ +/* Module 3. Calculate integral of sin(x) from 0.0 to */ +/* PI/3.0 using Trapazoidal Method. Result */ +/* is 0.5. There are 17 double precision */ +/* operations per loop (6 +, 2 -, 9 *, 0 /) */ +/* included in the timing. */ +/* 35.3% +, 11.8% -, 52.9% *, and 00.0% / */ +/*******************************************************/ + + x = piref / ( three * (double)m ); /*********************/ + s = 0.0; /* Loop 4. */ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + v = v + one; + u = v * x; + w = u * u; + s = s + u * ((((((A6*w-A5)*w+A4)*w-A3)*w+A2)*w+A1)*w+one); + } + dtime(TimeArray); + T[9] = T[1] * TimeArray[1] - nulltime; + + u = piref / three; + w = u * u; + sa = u * ((((((A6*w-A5)*w+A4)*w-A3)*w+A2)*w+A1)*w+one); + + T[10] = T[9] / 17.0; /*********************/ + sa = x * ( sa + two * s ) / two; /* sin(x) Results. */ + sb = 0.5; /*********************/ + sc = sa - sb; + T[11] = one / T[10]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 3 %13.4le %10.4lf %10.4lf\n",sc,T[9],T[11]); + +/************************************************************/ +/* Module 4. Calculate Integral of cos(x) from 0.0 to PI/3 */ +/* using the Trapazoidal Method. Result is */ +/* sin(PI/3). There are 15 double precision */ +/* operations per loop (7 +, 0 -, 8 *, and 0 / ) */ +/* included in the timing. */ +/* 50.0% +, 00.0% -, 50.0% *, 00.0% / */ +/************************************************************/ + A3 = -A3; + A5 = -A5; + x = piref / ( three * (double)m ); /*********************/ + s = 0.0; /* Loop 5. */ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + u = (double)i * x; + w = u * u; + s = s + w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + } + dtime(TimeArray); + T[12] = T[1] * TimeArray[1] - nulltime; + + u = piref / three; + w = u * u; + sa = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + + T[13] = T[12] / 15.0; /*******************/ + sa = x * ( sa + one + two * s ) / two; /* Module 4 Result */ + u = piref / three; /*******************/ + w = u * u; + sb = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+A0); + sc = sa - sb; + T[14] = one / T[13]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 4 %13.4le %10.4lf %10.4lf\n",sc,T[12],T[14]); + +/************************************************************/ +/* Module 5. Calculate Integral of tan(x) from 0.0 to PI/3 */ +/* using the Trapazoidal Method. Result is */ +/* ln(cos(PI/3)). There are 29 double precision */ +/* operations per loop (13 +, 0 -, 15 *, and 1 /)*/ +/* included in the timing. */ +/* 46.7% +, 00.0% -, 50.0% *, and 03.3% / */ +/************************************************************/ + + x = piref / ( three * (double)m ); /*********************/ + s = 0.0; /* Loop 6. */ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + u = (double)i * x; + w = u * u; + v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + s = s + v / (w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one); + } + dtime(TimeArray); + T[15] = T[1] * TimeArray[1] - nulltime; + + u = piref / three; + w = u * u; + sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + sa = sa / sb; + + T[16] = T[15] / 29.0; /*******************/ + sa = x * ( sa + two * s ) / two; /* Module 5 Result */ + sb = 0.6931471805599453; /*******************/ + sc = sa - sb; + T[17] = one / T[16]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 5 %13.4le %10.4lf %10.4lf\n",sc,T[15],T[17]); + +/************************************************************/ +/* Module 6. Calculate Integral of sin(x)*cos(x) from 0.0 */ +/* to PI/4 using the Trapazoidal Method. Result */ +/* is sin(PI/4)^2. There are 29 double precision */ +/* operations per loop (13 +, 0 -, 16 *, and 0 /)*/ +/* included in the timing. */ +/* 46.7% +, 00.0% -, 53.3% *, and 00.0% / */ +/************************************************************/ + + x = piref / ( four * (double)m ); /*********************/ + s = 0.0; /* Loop 7. */ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + u = (double)i * x; + w = u * u; + v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + s = s + v*(w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one); + } + dtime(TimeArray); + T[18] = T[1] * TimeArray[1] - nulltime; + + u = piref / four; + w = u * u; + sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + sa = sa * sb; + + T[19] = T[18] / 29.0; /*******************/ + sa = x * ( sa + two * s ) / two; /* Module 6 Result */ + sb = 0.25; /*******************/ + sc = sa - sb; + T[20] = one / T[19]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 6 %13.4le %10.4lf %10.4lf\n",sc,T[18],T[20]); + + +/*******************************************************/ +/* Module 7. Calculate value of the definite integral */ +/* from 0 to sa of 1/(x+1), x/(x*x+1), and */ +/* x*x/(x*x*x+1) using the Trapizoidal Rule.*/ +/* There are 12 double precision operations */ +/* per loop ( 3 +, 3 -, 3 *, and 3 / ) that */ +/* are included in the timing. */ +/* 25.0% +, 25.0% -, 25.0% *, and 25.0% / */ +/*******************************************************/ + + /*********************/ + s = 0.0; /* Loop 8. */ + w = one; /*********************/ + sa = 102.3321513995275; + v = sa / (double)m; + + dtime(TimeArray); + for ( i = 1 ; i <= m-1 ; i++) + { + x = (double)i * v; + u = x * x; + s = s - w / ( x + w ) - x / ( u + w ) - u / ( x * u + w ); + } + dtime(TimeArray); + T[21] = T[1] * TimeArray[1] - nulltime; + /*********************/ + /* Module 7 Results */ + /*********************/ + T[22] = T[21] / 12.0; + x = sa; + u = x * x; + sa = -w - w / ( x + w ) - x / ( u + w ) - u / ( x * u + w ); + sa = 18.0 * v * (sa + two * s ); + + m = -2000 * (long)sa; + m = (long)( (double)m / scale ); + + sc = sa + 500.2; + T[23] = one / T[22]; + /********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /********************/ + printf(" 7 %13.4le %10.4lf %10.4lf\n",sc,T[21],T[23]); + +/************************************************************/ +/* Module 8. Calculate Integral of sin(x)*cos(x)*cos(x) */ +/* from 0 to PI/3 using the Trapazoidal Method. */ +/* Result is (1-cos(PI/3)^3)/3. There are 30 */ +/* double precision operations per loop included */ +/* in the timing: */ +/* 13 +, 0 -, 17 * 0 / */ +/* 46.7% +, 00.0% -, 53.3% *, and 00.0% / */ +/************************************************************/ + + x = piref / ( three * (double)m ); /*********************/ + s = 0.0; /* Loop 9. */ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + u = (double)i * x; + w = u * u; + v = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + s = s + v*v*u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + } + dtime(TimeArray); + T[24] = T[1] * TimeArray[1] - nulltime; + + u = piref / three; + w = u * u; + sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + sa = sa * sb * sb; + + T[25] = T[24] / 30.0; /*******************/ + sa = x * ( sa + two * s ) / two; /* Module 8 Result */ + sb = 0.29166666666666667; /*******************/ + sc = sa - sb; + T[26] = one / T[25]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 8 %13.4le %10.4lf %10.4lf\n",sc,T[24],T[26]); + +/**************************************************/ +/* MFLOPS(1) output. This is the same weighting */ +/* used for all previous versions of the flops.c */ +/* program. Includes Modules 2 and 3 only. */ +/**************************************************/ + T[27] = ( five * (T[6] - T[5]) + T[9] ) / 52.0; + T[28] = one / T[27]; + +/**************************************************/ +/* MFLOPS(2) output. This output does not include */ +/* Module 2, but it still does 9.2% FDIV's. */ +/**************************************************/ + T[29] = T[2] + T[9] + T[12] + T[15] + T[18]; + T[29] = (T[29] + four * T[21]) / 152.0; + T[30] = one / T[29]; + +/**************************************************/ +/* MFLOPS(3) output. This output does not include */ +/* Module 2, but it still does 3.4% FDIV's. */ +/**************************************************/ + T[31] = T[2] + T[9] + T[12] + T[15] + T[18]; + T[31] = (T[31] + T[21] + T[24]) / 146.0; + T[32] = one / T[31]; + +/**************************************************/ +/* MFLOPS(4) output. This output does not include */ +/* Module 2, and it does NO FDIV's. */ +/**************************************************/ + T[33] = (T[9] + T[12] + T[18] + T[24]) / 91.0; + T[34] = one / T[33]; + + + printf("\n"); + printf(" Iterations = %10ld\n",m); + printf(" NullTime (usec) = %10.4lf\n",nulltime); + printf(" MFLOPS(1) = %10.4lf\n",T[28]); + printf(" MFLOPS(2) = %10.4lf\n",T[30]); + printf(" MFLOPS(3) = %10.4lf\n",T[32]); + printf(" MFLOPS(4) = %10.4lf\n\n",T[34]); + return 0; +} + +/*****************************************************/ +/* Various timer routines. */ +/* Al Aburto, aburto@nosc.mil, 18 Feb 1997 */ +/* */ +/* dtime(p) outputs the elapsed time seconds in p[1] */ +/* from a call of dtime(p) to the next call of */ +/* dtime(p). Use CAUTION as some of these routines */ +/* will mess up when timing across the hour mark!!! */ +/* */ +/* For timing I use the 'user' time whenever */ +/* possible. Using 'user+sys' time is a separate */ +/* issue. */ +/* */ +/* Example Usage: */ +/* [Timer options added here] */ +/* double RunTime, TimeArray[3]; */ +/* main() */ +/* { */ +/* dtime(TimeArray); */ +/* [routine to time] */ +/* dtime(TimeArray); */ +/* RunTime = TimeArray[1]; */ +/* } */ +/* [Timer code added here] */ +/*****************************************************/ + +/******************************/ +/* Timer code. */ +/******************************/ + +/*******************/ +/* Amiga dtime() */ +/*******************/ +#ifdef Amiga +#include +#define HZ 50 + +dtime(p) +double p[]; +{ + double q; + + struct tt { + long days; + long minutes; + long ticks; + } tt; + + q = p[2]; + + DateStamp(&tt); + + p[2] = ( (double)(tt.ticks + (tt.minutes * 60L * 50L)) ) / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*****************************************************/ +/* UNIX dtime(). This is the preferred UNIX timer. */ +/* Provided by: Markku Kolkka, mk59200@cc.tut.fi */ +/* HP-UX Addition by: Bo Thide', bt@irfu.se */ +/*****************************************************/ +#ifdef UNIX +#include +#include + +#ifdef hpux +#include +#define getrusage(a,b) syscall(SYS_getrusage,a,b) +#endif + +struct rusage rusage; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + getrusage(RUSAGE_SELF,&rusage); + + p[2] = (double)(rusage.ru_utime.tv_sec); + p[2] = p[2] + (double)(rusage.ru_utime.tv_usec) * 1.0e-06; + p[1] = p[2] - q; + + return 0; +} +#endif + +/***************************************************/ +/* UNIX_Old dtime(). This is the old UNIX timer. */ +/* Use only if absolutely necessary as HZ may be */ +/* ill defined on your system. */ +/***************************************************/ +#ifdef UNIX_Old +#include +#include +#include + +#ifndef HZ +#define HZ 60 +#endif + +struct tms tms; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + times(&tms); + + p[2] = (double)(tms.tms_utime) / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*********************************************************/ +/* VMS dtime() for VMS systems. */ +/* Provided by: RAMO@uvphys.phys.UVic.CA */ +/* Some people have run into problems with this timer. */ +/*********************************************************/ +#ifdef VMS +#include time + +#ifndef HZ +#define HZ 100 +#endif + +struct tbuffer_t + { + int proc_user_time; + int proc_system_time; + int child_user_time; + int child_system_time; + }; + +struct tbuffer_t tms; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + times(&tms); + + p[2] = (double)(tms.proc_user_time) / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/******************************/ +/* BORLAND C dtime() for DOS */ +/******************************/ +#ifdef BORLAND_C +#include +#include +#include + +#define HZ 100 +struct time tnow; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + gettime(&tnow); + + p[2] = 60.0 * (double)(tnow.ti_min); + p[2] = p[2] + (double)(tnow.ti_sec); + p[2] = p[2] + (double)(tnow.ti_hund)/(double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/**************************************/ +/* Microsoft C (MSC) dtime() for DOS */ +/**************************************/ +#ifdef MSC +#include +#include + +#define HZ CLOCKS_PER_SEC +clock_t tnow; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + tnow = clock(); + + p[2] = (double)tnow / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*************************************/ +/* Macintosh (MAC) Think C dtime() */ +/*************************************/ +#ifdef MAC +#include + +#define HZ 60 + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + p[2] = (double)clock() / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/************************************************************/ +/* iPSC/860 (IPSC) dtime() for i860. */ +/* Provided by: Dan Yergeau, yergeau@gloworm.Stanford.EDU */ +/************************************************************/ +#ifdef IPSC +extern double dclock(); + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + p[2] = dclock(); + p[1] = p[2] - q; + + return 0; +} +#endif + +/**************************************************/ +/* FORTRAN dtime() for Cray type systems. */ +/* This is the preferred timer for Cray systems. */ +/**************************************************/ +#ifdef FORTRAN_SEC + +fortran double second(); + +dtime(p) +double p[]; +{ + double q,v; + + q = p[2]; + + second(&v); + p[2] = v; + p[1] = p[2] - q; + + return 0; +} +#endif + +/***********************************************************/ +/* UNICOS C dtime() for Cray UNICOS systems. Don't use */ +/* unless absolutely necessary as returned time includes */ +/* 'user+system' time. Provided by: R. Mike Dority, */ +/* dority@craysea.cray.com */ +/***********************************************************/ +#ifdef CTimer +#include + +dtime(p) +double p[]; +{ + double q; + clock_t clock(void); + + q = p[2]; + + p[2] = (double)clock() / (double)CLOCKS_PER_SEC; + p[1] = p[2] - q; + + return 0; +} +#endif + +/********************************************/ +/* Another UNIX timer using gettimeofday(). */ +/* However, getrusage() is preferred. */ +/********************************************/ +#ifdef GTODay +#include + +struct timeval tnow; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + gettimeofday(&tnow,NULL); + p[2] = (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*****************************************************/ +/* Fujitsu UXP/M timer. */ +/* Provided by: Mathew Lim, ANUSF, M.Lim@anu.edu.au */ +/*****************************************************/ +#ifdef UXPM +#include +#include +struct tmsu rusage; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + timesu(&rusage); + + p[2] = (double)(rusage.tms_utime) * 1.0e-06; + p[1] = p[2] - q; + + return 0; +} +#endif + +/**********************************************/ +/* Macintosh (MAC_TMgr) Think C dtime() */ +/* requires Think C Language Extensions or */ +/* #include in the prefix */ +/* provided by Francis H Schiffer 3rd (fhs) */ +/* skipschiffer@genie.geis.com */ +/**********************************************/ +#ifdef MAC_TMgr +#include +#include + +static TMTask mgrTimer; +static Boolean mgrInited = FALSE; +static double mgrClock; + +#define RMV_TIMER RmvTime( (QElemPtr)&mgrTimer ) +#define MAX_TIME 1800000000L +/* MAX_TIME limits time between calls to */ +/* dtime( ) to no more than 30 minutes */ +/* this limitation could be removed by */ +/* creating a completion routine to sum */ +/* 30 minute segments (fhs 1994 feb 9) */ + +static void Remove_timer( ) +{ + RMV_TIMER; + mgrInited = FALSE; +} + +int dtime( p ) +double p[]; +{ + if ( mgrInited ) { + RMV_TIMER; + mgrClock += (MAX_TIME + mgrTimer.tmCount)*1.0e-6; + } else { + if ( _atexit( &Remove_timer ) == 0 ) mgrInited = TRUE; + mgrClock = 0.0; + } + + p[1] = mgrClock - p[2]; + p[2] = mgrClock; + if ( mgrInited ) { + mgrTimer.tmAddr = NULL; + mgrTimer.tmCount = 0; + mgrTimer.tmWakeUp = 0; + mgrTimer.tmReserved = 0; + InsTime( (QElemPtr)&mgrTimer ); + PrimeTime( (QElemPtr)&mgrTimer, -MAX_TIME ); + } + return( 0 ); +} +#endif + +/***********************************************************/ +/* Parsytec GCel timer. */ +/* Provided by: Georg Wambach, gw@informatik.uni-koeln.de */ +/***********************************************************/ +#ifdef PARIX +#include + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + p[2] = (double) (TimeNowHigh()) / (double) CLK_TCK_HIGH; + p[1] = p[2] - q; + + return 0; +} +#endif + +/************************************************/ +/* Sun Solaris POSIX dtime() routine */ +/* Provided by: Case Larsen, CTLarsen@lbl.gov */ +/************************************************/ +#ifdef POSIX +#include +#include +#include + +#ifdef __hpux +#include +#define getrusage(a,b) syscall(SYS_getrusage,a,b) +#endif + +struct rusage rusage; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + getrusage(RUSAGE_SELF,&rusage); + + p[2] = (double)(rusage.ru_utime.tv_sec); + p[2] = p[2] + (double)(rusage.ru_utime.tv_nsec) * 1.0e-09; + p[1] = p[2] - q; + + return 0; +} +#endif + +/****************************************************/ +/* Windows NT (32 bit) dtime() routine */ +/* Provided by: Piers Haken, piersh@microsoft.com */ +/****************************************************/ +#ifdef WIN32 +#include + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + p[2] = (double)GetTickCount() * 1.0e-03; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*****************************************************/ +/* Time according to POSIX.1 - */ +/* Ref: "POSIX Programmer's Guide" O'Reilly & Assoc.*/ +/*****************************************************/ +#ifdef POSIX1 +#define _POSIX_SOURCE 1 +#include +#include +#include + +struct tms tms; + +dtime(p) +double p[]; +{ + double q; + times(&tms); + q = p[2]; + p[2] = (double)tms.tms_utime / (double)CLK_TCK; + p[1] = p[2] - q; + return 0; +} +#endif + +/*------ End flops.c code, say good night Jan! (Sep 1992) ------*/ diff --git a/Sheet3/stream.c b/Sheet3/stream.c new file mode 100644 index 0000000..b9a2cee --- /dev/null +++ b/Sheet3/stream.c @@ -0,0 +1,585 @@ +/*-----------------------------------------------------------------------*/ +/* Program: STREAM */ +/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ +/* Original code developed by John D. McCalpin */ +/* Programmers: John D. McCalpin */ +/* Joe R. Zagar */ +/* */ +/* This program measures memory transfer rates in MB/s for simple */ +/* computational kernels coded in C. */ +/*-----------------------------------------------------------------------*/ +/* Copyright 1991-2013: John D. McCalpin */ +/*-----------------------------------------------------------------------*/ +/* License: */ +/* 1. You are free to use this program and/or to redistribute */ +/* this program. */ +/* 2. You are free to modify this program for your own use, */ +/* including commercial use, subject to the publication */ +/* restrictions in item 3. */ +/* 3. You are free to publish results obtained from running this */ +/* program, or from works that you derive from this program, */ +/* with the following limitations: */ +/* 3a. In order to be referred to as "STREAM benchmark results", */ +/* published results must be in conformance to the STREAM */ +/* Run Rules, (briefly reviewed below) published at */ +/* http://www.cs.virginia.edu/stream/ref.html */ +/* and incorporated herein by reference. */ +/* As the copyright holder, John McCalpin retains the */ +/* right to determine conformity with the Run Rules. */ +/* 3b. Results based on modified source code or on runs not in */ +/* accordance with the STREAM Run Rules must be clearly */ +/* labelled whenever they are published. Examples of */ +/* proper labelling include: */ +/* "tuned STREAM benchmark results" */ +/* "based on a variant of the STREAM benchmark code" */ +/* Other comparable, clear, and reasonable labelling is */ +/* acceptable. */ +/* 3c. Submission of results to the STREAM benchmark web site */ +/* is encouraged, but not required. */ +/* 4. Use of this program or creation of derived works based on this */ +/* program constitutes acceptance of these licensing restrictions. */ +/* 5. Absolutely no warranty is expressed or implied. */ +/*-----------------------------------------------------------------------*/ +# include +# include +# include +# include +# include +# include + +/*----------------------------------------------------------------------- + * INSTRUCTIONS: + * + * 1) STREAM requires different amounts of memory to run on different + * systems, depending on both the system cache size(s) and the + * granularity of the system timer. + * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) + * to meet *both* of the following criteria: + * (a) Each array must be at least 4 times the size of the + * available cache memory. I don't worry about the difference + * between 10^6 and 2^20, so in practice the minimum array size + * is about 3.8 times the cache size. + * Example 1: One Xeon E3 with 8 MB L3 cache + * STREAM_ARRAY_SIZE should be >= 4 million, giving + * an array size of 30.5 MB and a total memory requirement + * of 91.5 MB. + * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) + * STREAM_ARRAY_SIZE should be >= 20 million, giving + * an array size of 153 MB and a total memory requirement + * of 458 MB. + * (b) The size should be large enough so that the 'timing calibration' + * output by the program is at least 20 clock-ticks. + * Example: most versions of Windows have a 10 millisecond timer + * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. + * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. + * This means the each array must be at least 1 GB, or 128M elements. + * + * Version 5.10 increases the default array size from 2 million + * elements to 10 million elements in response to the increasing + * size of L3 caches. The new default size is large enough for caches + * up to 20 MB. + * Version 5.10 changes the loop index variables from "register int" + * to "ssize_t", which allows array indices >2^32 (4 billion) + * on properly configured 64-bit systems. Additional compiler options + * (such as "-mcmodel=medium") may be required for large memory runs. + * + * Array size can be set at compile time without modifying the source + * code for the (many) compilers that support preprocessor definitions + * on the compile line. E.g., + * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M + * will override the default size of 10M with a new size of 100M elements + * per array. + */ +#ifndef STREAM_ARRAY_SIZE +# define STREAM_ARRAY_SIZE 10000000 +#endif + +/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result + * for any iteration after the first, therefore the minimum value + * for NTIMES is 2. + * There are no rules on maximum allowable values for NTIMES, but + * values larger than the default are unlikely to noticeably + * increase the reported performance. + * NTIMES can also be set on the compile line without changing the source + * code using, for example, "-DNTIMES=7". + */ +#ifdef NTIMES +#if NTIMES<=1 +# define NTIMES 10 +#endif +#endif +#ifndef NTIMES +# define NTIMES 10 +#endif + +/* Users are allowed to modify the "OFFSET" variable, which *may* change the + * relative alignment of the arrays (though compilers may change the + * effective offset by making the arrays non-contiguous on some systems). + * Use of non-zero values for OFFSET can be especially helpful if the + * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. + * OFFSET can also be set on the compile line without changing the source + * code using, for example, "-DOFFSET=56". + */ +#ifndef OFFSET +# define OFFSET 0 +#endif + +/* + * 3) Compile the code with optimization. Many compilers generate + * unreasonably bad code before the optimizer tightens things up. + * If the results are unreasonably good, on the other hand, the + * optimizer might be too smart for me! + * + * For a simple single-core version, try compiling with: + * cc -O stream.c -o stream + * This is known to work on many, many systems.... + * + * To use multiple cores, you need to tell the compiler to obey the OpenMP + * directives in the code. This varies by compiler, but a common example is + * gcc -O -fopenmp stream.c -o stream_omp + * The environment variable OMP_NUM_THREADS allows runtime control of the + * number of threads/cores used when the resulting "stream_omp" program + * is executed. + * + * To run with single-precision variables and arithmetic, simply add + * -DSTREAM_TYPE=float + * to the compile line. + * Note that this changes the minimum array sizes required --- see (1) above. + * + * The preprocessor directive "TUNED" does not do much -- it simply causes the + * code to call separate functions to execute each kernel. Trivial versions + * of these functions are provided, but they are *not* tuned -- they just + * provide predefined interfaces to be replaced with tuned code. + * + * + * 4) Optional: Mail the results to mccalpin@cs.virginia.edu + * Be sure to include info that will help me understand: + * a) the computer hardware configuration (e.g., processor model, memory type) + * b) the compiler name/version and compilation flags + * c) any run-time information (such as OMP_NUM_THREADS) + * d) all of the output from the test case. + * + * Thanks! + * + *-----------------------------------------------------------------------*/ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +#ifndef STREAM_TYPE +#define STREAM_TYPE double +#endif + +static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], + b[STREAM_ARRAY_SIZE+OFFSET], + c[STREAM_ARRAY_SIZE+OFFSET]; + +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE + }; + +extern double mysecond(); +extern void checkSTREAMresults(); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(STREAM_TYPE scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(STREAM_TYPE scalar); +#endif +#ifdef _OPENMP +extern int omp_get_num_threads(); +#endif +int +main() + { + int quantum, checktick(); + int BytesPerWord; + int k; + ssize_t j; + STREAM_TYPE scalar; + double t, times[4][NTIMES]; + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + printf("STREAM version $Revision: 5.10 $\n"); + printf(HLINE); + BytesPerWord = sizeof(STREAM_TYPE); + printf("This system uses %d bytes per array element.\n", + BytesPerWord); + + printf(HLINE); +#ifdef N + printf("***** WARNING: ******\n"); + printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); + printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); + printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); + printf("***** WARNING: ******\n"); +#endif + + printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); + printf("Memory per array = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); + printf("Total memory required = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); + printf("Each kernel will be executed %d times.\n", NTIMES); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" will be used to compute the reported bandwidth.\n"); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } + } +#endif + +#ifdef _OPENMP + k = 0; +#pragma omp parallel +#pragma omp atomic + k++; + printf ("Number of Threads counted = %i\n",k); +#endif + + /* Get initial value for system clock. */ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else { + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; + } + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < STREAM_ARRAY_SIZE; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + scalar = 3.0; + for (k=0; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +#ifndef abs +#define abs(a) ((a) >= 0 ? (a) : -(a)) +#endif +void checkSTREAMresults () +{ + STREAM_TYPE aj,bj,cj,scalar; + STREAM_TYPE aSumErr,bSumErr,cSumErr; + STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; + double epsilon; + ssize_t j; + int k,ierr,err; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k epsilon) { + err++; + printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,aj,a[j],abs((aj-a[j])/aAvgErr)); + } +#endif + } + } + printf(" For array a[], %d errors were found.\n",ierr); + } + if (abs(bAvgErr/bj) > epsilon) { + err++; + printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,bj,b[j],abs((bj-b[j])/bAvgErr)); + } +#endif + } + } + printf(" For array b[], %d errors were found.\n",ierr); + } + if (abs(cAvgErr/cj) > epsilon) { + err++; + printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,cj,c[j],abs((cj-c[j])/cAvgErr)); + } +#endif + } + } + printf(" For array c[], %d errors were found.\n",ierr); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + } +#ifdef VERBOSE + printf ("Results Validation Verbose Results: \n"); + printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); + printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); + printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); +#endif +} + +#ifdef TUNED +/* stubs for "tuned" versions of the kernels */ +void tuned_STREAM_Copy() +{ + ssize_t j; +#pragma omp parallel for + for (j=0; j