From 64c2461add37e871ea79454fa94c48858f16016c Mon Sep 17 00:00:00 2001
From: "lisa.pizzo" <lisa.pizzo@noreply.imsc.uni-graz.at>
Date: Wed, 29 Oct 2025 23:01:12 +0100
Subject: [PATCH] Upload files to "Sheet3"

---
 Sheet3/flops.c  | 1197 +++++++++++++++++++++++++++++++++++++++++++++++
 Sheet3/stream.c |  585 +++++++++++++++++++++++
 2 files changed, 1782 insertions(+)
 create mode 100644 Sheet3/flops.c
 create mode 100644 Sheet3/stream.c

diff --git a/Sheet3/flops.c b/Sheet3/flops.c
new file mode 100644
index 0000000..c7aa870
--- /dev/null
+++ b/Sheet3/flops.c
@@ -0,0 +1,1197 @@
+/*  gcc  -O3 -DUNIX -funroll-all-loops -o flops flops.c      */
+/*  ./flops                              */
+
+/*--------------------- Start flops.c source code ----------------------*/
+
+/*****************************/
+/*          flops.c          */
+/* Version 2.0,  18 Dec 1992 */
+/*         Al Aburto         */
+/*      aburto@nosc.mil      */
+/*****************************/
+
+/*
+   Flops.c is a 'c' program which attempts to estimate your systems
+   floating-point 'MFLOPS' rating for the FADD, FSUB, FMUL, and FDIV
+   operations based on specific 'instruction mixes' (discussed below).
+   The program provides an estimate of PEAK MFLOPS performance by making
+   maximal use of register variables with minimal interaction with main
+   memory. The execution loops are all small so that they will fit in
+   any cache. Flops.c can be used along with Linpack and the Livermore
+   kernels (which exersize memory much more extensively) to gain further
+   insight into the limits of system performance. The flops.c execution
+   modules also include various percent weightings of FDIV's (from 0% to
+   25% FDIV's) so that the range of performance can be obtained when
+   using FDIV's. FDIV's, being computationally more intensive than
+   FADD's or FMUL's, can impact performance considerably on some systems.
+
+   Flops.c consists of 8 independent modules (routines) which, except for
+   module 2, conduct numerical integration of various functions. Module
+   2, estimates the value of pi based upon the Maclaurin series expansion
+   of atan(1). MFLOPS ratings are provided for each module, but the
+   programs overall results are summerized by the MFLOPS(1), MFLOPS(2),
+   MFLOPS(3), and MFLOPS(4) outputs.
+
+   The MFLOPS(1) result is identical to the result provided by all
+   previous versions of flops.c. It is based only upon the results from
+   modules 2 and 3. Two problems surfaced in using MFLOPS(1). First, it
+   was difficult to completely 'vectorize' the result due to the
+   recurrence of the 's' variable in module 2. This problem is addressed
+   in the MFLOPS(2) result which does not use module 2, but maintains
+   nearly the same weighting of FDIV's (9.2%) as in MFLOPS(1) (9.6%).
+   The second problem with MFLOPS(1) centers around the percentage of
+   FDIV's (9.6%) which was viewed as too high for an important class of
+   problems. This concern is addressed in the MFLOPS(3) result where NO
+   FDIV's are conducted at all.
+
+   The number of floating-point instructions per iteration (loop) is
+   given below for each module executed:
+
+   MODULE   FADD   FSUB   FMUL   FDIV   TOTAL  Comment
+     1        7      0      6      1      14   7.1%  FDIV's
+     2        3      2      1      1       7   difficult to vectorize.
+     3        6      2      9      0      17   0.0%  FDIV's
+     4        7      0      8      0      15   0.0%  FDIV's
+     5       13      0     15      1      29   3.4%  FDIV's
+     6       13      0     16      0      29   0.0%  FDIV's
+     7        3      3      3      3      12   25.0% FDIV's
+     8       13      0     17      0      30   0.0%  FDIV's
+
+   A*2+3     21     12     14      5      52   A=5, MFLOPS(1), Same as
+	   40.4%  23.1%  26.9%  9.6%          previous versions of the
+						flops.c program. Includes
+						only Modules 2 and 3, does
+						9.6% FDIV's, and is not
+						easily vectorizable.
+
+   1+3+4     58     14     66     14     152   A=4, MFLOPS(2), New output
+   +5+6+    38.2%  9.2%   43.4%  9.2%          does not include Module 2,
+   A*7                                         but does 9.2% FDIV's.
+
+   1+3+4     62      5     74      5     146   A=0, MFLOPS(3), New output
+   +5+6+    42.9%  3.4%   50.7%  3.4%          does not include Module 2,
+   7+8                                         but does 3.4% FDIV's.
+
+   3+4+6     39      2     50      0      91   A=0, MFLOPS(4), New output
+   +8       42.9%  2.2%   54.9%  0.0%          does not include Module 2,
+						and does NO FDIV's.
+
+   NOTE: Various timer routines are included as indicated below. The
+	timer routines, with some comments, are attached at the end
+	of the main program.
+
+   NOTE: Please do not remove any of the printouts.
+
+   EXAMPLE COMPILATION:
+   UNIX based systems
+       cc -DUNIX -O flops.c -o flops
+       cc -DUNIX -DROPT flops.c -o flops
+       cc -DUNIX -fast -O4 flops.c -o flops
+       .
+       .
+       .
+     etc.
+
+   Al Aburto
+   aburto@nosc.mil
+*/
+
+/***************************************************************/
+/* Timer options. You MUST uncomment one of the options below  */
+/* or compile, for example, with the '-DUNIX' option.          */
+/***************************************************************/
+/* #define Amiga       */
+/* #define UNIX        */
+/* #define UNIX_Old    */
+/* #define VMS         */
+/* #define BORLAND_C   */
+/* #define MSC         */
+/* #define MAC         */
+/* #define IPSC        */
+/* #define FORTRAN_SEC */
+/* #define GTODay      */
+/* #define CTimer      */
+/* #define UXPM        */
+/* #define MAC_TMgr    */
+/* #define PARIX       */
+/* #define POSIX       */
+/* #define WIN32       */
+/* #define POSIX1      */
+/***********************/
+
+#include <stdio.h>
+#include <math.h>
+			    /* 'Uncomment' the line below to run   */
+			    /* with 'register double' variables    */
+			    /* defined, or compile with the        */
+			    /* '-DROPT' option. Don't need this if */
+			    /* registers used automatically, but   */
+			    /* you might want to try it anyway.    */
+/* #define ROPT */
+#include <sys/time.h>
+#include <time.h>
+
+#ifdef __APPLE__
+#include <mach/mach_time.h>
+double dtime(double p[])
+{
+    static double timebase = 0.0;
+    static uint64_t timestart = 0;
+    uint64_t now;
+    double q;
+
+    if (timebase == 0.0) {
+        mach_timebase_info_data_t tb;
+        mach_timebase_info(&tb);
+        timebase = tb.numer / (1e9 * tb.denom);
+        timestart = mach_absolute_time();
+        p[2] = 0.0;
+        p[1] = 0.0;
+        return 0.0;
+    }
+
+    q = p[2];
+    now = mach_absolute_time();
+    p[2] = (now - timestart) * timebase; // seconds
+    p[1] = p[2] - q;
+    return 0.0;
+}
+#else
+#include <time.h>
+double dtime(double p[])
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    double now = ts.tv_sec + ts.tv_nsec * 1e-9;
+    double q = p[2];
+    p[2] = now;
+    p[1] = p[2] - q;
+    return 0.0;
+}
+#endif
+
+double nulltime, TimeArray[3];   /* Variables needed for 'dtime()'.     */
+double TLimit;                   /* Threshold to determine Number of    */
+				 /* Loops to run. Fixed at 15.0 seconds.*/
+
+double T[36];                    /* Global Array used to hold timing    */
+				 /* results and other information.      */
+
+double sa,sb,sc,sd,one,two,three;
+double four,five,piref,piprg;
+double scale,pierr;
+
+double A0 = 1.0;
+double A1 = -0.1666666666671334;
+double A2 = 0.833333333809067E-2;
+double A3 = 0.198412715551283E-3;
+double A4 = 0.27557589750762E-5;
+double A5 = 0.2507059876207E-7;
+double A6 = 0.164105986683E-9;
+
+double B0 = 1.0;
+double B1 = -0.4999999999982;
+double B2 = 0.4166666664651E-1;
+double B3 = -0.1388888805755E-2;
+double B4 = 0.24801428034E-4;
+double B5 = -0.2754213324E-6;
+double B6 = 0.20189405E-8;
+
+double C0 = 1.0;
+double C1 = 0.99999999668;
+double C2 = 0.49999995173;
+double C3 = 0.16666704243;
+double C4 = 0.4166685027E-1;
+double C5 = 0.832672635E-2;
+double C6 = 0.140836136E-2;
+double C7 = 0.17358267E-3;
+double C8 = 0.3931683E-4;
+
+double D1 = 0.3999999946405E-1;
+double D2 = 0.96E-3;
+double D3 = 0.1233153E-5;
+
+double E2 = 0.48E-3;
+double E3 = 0.411051E-6;
+
+// void dtime(double p[]);
+
+int main()
+{
+
+#ifdef ROPT
+   register double s,u,v,w,x;
+#else
+   double s,u,v,w,x;
+#endif
+
+   long loops, NLimit;
+   register long i, m, n;
+
+   printf("\n");
+   printf("   FLOPS C Program (Double Precision), V2.0 18 Dec 1992\n\n");
+
+			/****************************/
+   loops = 15625;        /* Initial number of loops. */
+			/*     DO NOT CHANGE!       */
+			/****************************/
+
+/****************************************************/
+/* Set Variable Values.                             */
+/* T[1] references all timing results relative to   */
+/* one million loops.                               */
+/*                                                  */
+/* The program will execute from 31250 to 512000000 */
+/* loops based on a runtime of Module 1 of at least */
+/* TLimit = 15.0 seconds. That is, a runtime of 15  */
+/* seconds for Module 1 is used to determine the    */
+/* number of loops to execute.                      */
+/*                                                  */
+/* No more than NLimit = 512000000 loops are allowed*/
+/****************************************************/
+
+   T[1] = 1.0E+06/(double)loops;
+
+   TLimit = 0.5; // CHANGED! IT WAS 15.0
+   NLimit = 512000000;
+
+   piref = 3.14159265358979324;
+   one   = 1.0;
+   two   = 2.0;
+   three = 3.0;
+   four  = 4.0;
+   five  = 5.0;
+   scale = one;
+
+   printf("   Module     Error        RunTime      MFLOPS\n");
+   printf("                            (usec)\n");
+/*************************/
+/* Initialize the timer. */
+/*************************/
+
+   dtime(TimeArray);
+   dtime(TimeArray);
+
+/*******************************************************/
+/* Module 1.  Calculate integral of df(x)/f(x) defined */
+/*            below.  Result is ln(f(1)). There are 14 */
+/*            double precision operations per loop     */
+/*            ( 7 +, 0 -, 6 *, 1 / ) that are included */
+/*            in the timing.                           */
+/*            50.0% +, 00.0% -, 42.9% *, and 07.1% /   */
+/*******************************************************/
+   n = loops;
+   sa = 0.0;
+
+   while ( sa < TLimit )
+   {
+   n = 2 * n;
+   x = one / (double)n;                            /*********************/
+   s = 0.0;                                        /*  Loop 1.          */
+   v = 0.0;                                        /*********************/
+   w = one;
+
+       dtime(TimeArray);
+       for( i = 1 ; i <= n-1 ; i++ )
+       {
+       v = v + w;
+       u = v * x;
+       s = s + (D1+u*(D2+u*D3))/(w+u*(D1+u*(E2+u*E3)));
+       }
+       dtime(TimeArray);
+       sa = TimeArray[1];
+
+   if ( n == NLimit ) break;
+   /* printf(" %10ld  %12.5lf\n",n,sa); */
+   }
+
+   scale = 1.0E+06 / (double)n;
+   T[1]  = scale;
+
+/****************************************/
+/* Estimate nulltime ('for' loop time). */
+/****************************************/
+   dtime(TimeArray);
+   for( i = 1 ; i <= n-1 ; i++ )
+   {
+   }
+   dtime(TimeArray);
+   nulltime = T[1] * TimeArray[1];
+   if ( nulltime < 0.0 ) nulltime = 0.0;
+
+   T[2] = T[1] * sa - nulltime;
+
+   sa = (D1+D2+D3)/(one+D1+E2+E3);
+   sb = D1;
+
+   T[3] = T[2] / 14.0;                             /*********************/
+   sa = x * ( sa + sb + two * s ) / two;           /* Module 1 Results. */
+   sb = one / sa;                                  /*********************/
+   n  = (long)( (double)( 40000 * (long)sb ) / scale );
+   sc = sb - 25.2;
+   T[4] = one / T[3];
+						   /********************/
+						   /*  DO NOT REMOVE   */
+						   /*  THIS PRINTOUT!  */
+						   /********************/
+   printf("     1   %13.4le  %10.4lf  %10.4lf\n",sc,T[2],T[4]);
+
+   m = n;
+
+/*******************************************************/
+/* Module 2.  Calculate value of PI from Taylor Series */
+/*            expansion of atan(1.0).  There are 7     */
+/*            double precision operations per loop     */
+/*            ( 3 +, 2 -, 1 *, 1 / ) that are included */
+/*            in the timing.                           */
+/*            42.9% +, 28.6% -, 14.3% *, and 14.3% /   */
+/*******************************************************/
+
+   s  = -five;                                      /********************/
+   sa = -one;                                       /* Loop 2.          */
+						   /********************/
+   dtime(TimeArray);
+   for ( i = 1 ; i <= m ; i++ )
+   {
+   s  = -s;
+   sa = sa + s;
+   }
+   dtime(TimeArray);
+   T[5] = T[1] * TimeArray[1];
+   if ( T[5] < 0.0 ) T[5] = 0.0;
+
+   sc   = (double)m;
+
+   u = sa;                                         /*********************/
+   v = 0.0;                                        /* Loop 3.           */
+   w = 0.0;                                        /*********************/
+   x = 0.0;
+
+   dtime(TimeArray);
+   for ( i = 1 ; i <= m ; i++)
+   {
+   s  = -s;
+   sa = sa + s;
+   u  = u + two;
+   x  = x +(s - u);
+   v  = v - s * u;
+   w  = w + s / u;
+   }
+   dtime(TimeArray);
+   T[6] = T[1] * TimeArray[1];
+
+   T[7] = ( T[6] - T[5] ) / 7.0;                   /*********************/
+   m  = (long)( sa * x  / sc );                    /*  PI Results       */
+   sa = four * w / five;                           /*********************/
+   sb = sa + five / v;
+   sc = 31.25;
+   piprg = sb - sc / (v * v * v);
+   pierr = piprg - piref;
+   T[8]  = one  / T[7];
+						  /*********************/
+						  /*   DO NOT REMOVE   */
+						  /*   THIS PRINTOUT!  */
+						  /*********************/
+   printf("     2   %13.4le  %10.4lf  %10.4lf\n",pierr,T[6]-T[5],T[8]);
+
+/*******************************************************/
+/* Module 3.  Calculate integral of sin(x) from 0.0 to */
+/*            PI/3.0 using Trapazoidal Method. Result  */
+/*            is 0.5. There are 17 double precision    */
+/*            operations per loop (6 +, 2 -, 9 *, 0 /) */
+/*            included in the timing.                  */
+/*            35.3% +, 11.8% -, 52.9% *, and 00.0% /   */
+/*******************************************************/
+
+   x = piref / ( three * (double)m );              /*********************/
+   s = 0.0;                                        /*  Loop 4.          */
+   v = 0.0;                                        /*********************/
+
+   dtime(TimeArray);
+   for( i = 1 ; i <= m-1 ; i++ )
+   {
+   v = v + one;
+   u = v * x;
+   w = u * u;
+   s = s + u * ((((((A6*w-A5)*w+A4)*w-A3)*w+A2)*w+A1)*w+one);
+   }
+   dtime(TimeArray);
+   T[9]  = T[1] * TimeArray[1] - nulltime;
+
+   u  = piref / three;
+   w  = u * u;
+   sa = u * ((((((A6*w-A5)*w+A4)*w-A3)*w+A2)*w+A1)*w+one);
+
+   T[10] = T[9] / 17.0;                            /*********************/
+   sa = x * ( sa + two * s ) / two;                /* sin(x) Results.   */
+   sb = 0.5;                                       /*********************/
+   sc = sa - sb;
+   T[11] = one / T[10];
+						  /*********************/
+						  /*   DO NOT REMOVE   */
+						  /*   THIS PRINTOUT!  */
+						  /*********************/
+   printf("     3   %13.4le  %10.4lf  %10.4lf\n",sc,T[9],T[11]);
+
+/************************************************************/
+/* Module 4.  Calculate Integral of cos(x) from 0.0 to PI/3 */
+/*            using the Trapazoidal Method. Result is       */
+/*            sin(PI/3). There are 15 double precision      */
+/*            operations per loop (7 +, 0 -, 8 *, and 0 / ) */
+/*            included in the timing.                       */
+/*            50.0% +, 00.0% -, 50.0% *, 00.0% /            */
+/************************************************************/
+   A3 = -A3;
+   A5 = -A5;
+   x = piref / ( three * (double)m );              /*********************/
+   s = 0.0;                                        /*  Loop 5.          */
+   v = 0.0;                                        /*********************/
+
+   dtime(TimeArray);
+   for( i = 1 ; i <= m-1 ; i++ )
+   {
+   u = (double)i * x;
+   w = u * u;
+   s = s + w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;
+   }
+   dtime(TimeArray);
+   T[12]  = T[1] * TimeArray[1] - nulltime;
+
+   u  = piref / three;
+   w  = u * u;
+   sa = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;
+
+   T[13] = T[12] / 15.0;                             /*******************/
+   sa = x * ( sa + one + two * s ) / two;            /* Module 4 Result */
+   u  = piref / three;                               /*******************/
+   w  = u * u;
+   sb = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+A0);
+   sc = sa - sb;
+   T[14] = one / T[13];
+						  /*********************/
+						  /*   DO NOT REMOVE   */
+						  /*   THIS PRINTOUT!  */
+						  /*********************/
+   printf("     4   %13.4le  %10.4lf  %10.4lf\n",sc,T[12],T[14]);
+
+/************************************************************/
+/* Module 5.  Calculate Integral of tan(x) from 0.0 to PI/3 */
+/*            using the Trapazoidal Method. Result is       */
+/*            ln(cos(PI/3)). There are 29 double precision  */
+/*            operations per loop (13 +, 0 -, 15 *, and 1 /)*/
+/*            included in the timing.                       */
+/*            46.7% +, 00.0% -, 50.0% *, and 03.3% /        */
+/************************************************************/
+
+   x = piref / ( three * (double)m );              /*********************/
+   s = 0.0;                                        /*  Loop 6.          */
+   v = 0.0;                                        /*********************/
+
+   dtime(TimeArray);
+   for( i = 1 ; i <= m-1 ; i++ )
+   {
+   u = (double)i * x;
+   w = u * u;
+   v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);
+   s = s + v / (w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one);
+   }
+   dtime(TimeArray);
+   T[15]  = T[1] * TimeArray[1] - nulltime;
+
+   u  = piref / three;
+   w  = u * u;
+   sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);
+   sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;
+   sa = sa / sb;
+
+   T[16] = T[15] / 29.0;                             /*******************/
+   sa = x * ( sa + two * s ) / two;                  /* Module 5 Result */
+   sb = 0.6931471805599453;                          /*******************/
+   sc = sa - sb;
+   T[17] = one / T[16];
+						  /*********************/
+						  /*   DO NOT REMOVE   */
+						  /*   THIS PRINTOUT!  */
+						  /*********************/
+   printf("     5   %13.4le  %10.4lf  %10.4lf\n",sc,T[15],T[17]);
+
+/************************************************************/
+/* Module 6.  Calculate Integral of sin(x)*cos(x) from 0.0  */
+/*            to PI/4 using the Trapazoidal Method. Result  */
+/*            is sin(PI/4)^2. There are 29 double precision */
+/*            operations per loop (13 +, 0 -, 16 *, and 0 /)*/
+/*            included in the timing.                       */
+/*            46.7% +, 00.0% -, 53.3% *, and 00.0% /        */
+/************************************************************/
+
+   x = piref / ( four * (double)m );               /*********************/
+   s = 0.0;                                        /*  Loop 7.          */
+   v = 0.0;                                        /*********************/
+
+   dtime(TimeArray);
+   for( i = 1 ; i <= m-1 ; i++ )
+   {
+   u = (double)i * x;
+   w = u * u;
+   v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);
+   s = s + v*(w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one);
+   }
+   dtime(TimeArray);
+   T[18]  = T[1] * TimeArray[1] - nulltime;
+
+   u  = piref / four;
+   w  = u * u;
+   sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);
+   sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;
+   sa = sa * sb;
+
+   T[19] = T[18] / 29.0;                             /*******************/
+   sa = x * ( sa + two * s ) / two;                  /* Module 6 Result */
+   sb = 0.25;                                        /*******************/
+   sc = sa - sb;
+   T[20] = one / T[19];
+						  /*********************/
+						  /*   DO NOT REMOVE   */
+						  /*   THIS PRINTOUT!  */
+						  /*********************/
+   printf("     6   %13.4le  %10.4lf  %10.4lf\n",sc,T[18],T[20]);
+
+
+/*******************************************************/
+/* Module 7.  Calculate value of the definite integral */
+/*            from 0 to sa of 1/(x+1), x/(x*x+1), and  */
+/*            x*x/(x*x*x+1) using the Trapizoidal Rule.*/
+/*            There are 12 double precision operations */
+/*            per loop ( 3 +, 3 -, 3 *, and 3 / ) that */
+/*            are included in the timing.              */
+/*            25.0% +, 25.0% -, 25.0% *, and 25.0% /   */
+/*******************************************************/
+
+						  /*********************/
+   s = 0.0;                                        /* Loop 8.           */
+   w = one;                                        /*********************/
+   sa = 102.3321513995275;
+   v = sa / (double)m;
+
+   dtime(TimeArray);
+   for ( i = 1 ; i <= m-1 ; i++)
+   {
+   x = (double)i * v;
+   u = x * x;
+   s = s - w / ( x + w ) - x / ( u + w ) - u / ( x * u + w );
+   }
+   dtime(TimeArray);
+   T[21] = T[1] * TimeArray[1] - nulltime;
+						  /*********************/
+						  /* Module 7 Results  */
+						  /*********************/
+   T[22] = T[21] / 12.0;
+   x  = sa;
+   u  = x * x;
+   sa = -w - w / ( x + w ) - x / ( u + w ) - u / ( x * u + w );
+   sa = 18.0 * v * (sa + two * s );
+
+   m  = -2000 * (long)sa;
+   m = (long)( (double)m / scale );
+
+   sc = sa + 500.2;
+   T[23] = one / T[22];
+						  /********************/
+						  /*  DO NOT REMOVE   */
+						  /*  THIS PRINTOUT!  */
+						  /********************/
+   printf("     7   %13.4le  %10.4lf  %10.4lf\n",sc,T[21],T[23]);
+
+/************************************************************/
+/* Module 8.  Calculate Integral of sin(x)*cos(x)*cos(x)    */
+/*            from 0 to PI/3 using the Trapazoidal Method.  */
+/*            Result is (1-cos(PI/3)^3)/3. There are 30     */
+/*            double precision operations per loop included */
+/*            in the timing:                                */
+/*               13 +,     0 -,    17 *          0 /        */
+/*            46.7% +, 00.0% -, 53.3% *, and 00.0% /        */
+/************************************************************/
+
+   x = piref / ( three * (double)m );              /*********************/
+   s = 0.0;                                        /*  Loop 9.          */
+   v = 0.0;                                        /*********************/
+
+   dtime(TimeArray);
+   for( i = 1 ; i <= m-1 ; i++ )
+   {
+   u = (double)i * x;
+   w = u * u;
+   v = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;
+   s = s + v*v*u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);
+   }
+   dtime(TimeArray);
+   T[24]  = T[1] * TimeArray[1] - nulltime;
+
+   u  = piref / three;
+   w  = u * u;
+   sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);
+   sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;
+   sa = sa * sb * sb;
+
+   T[25] = T[24] / 30.0;                             /*******************/
+   sa = x * ( sa + two * s ) / two;                  /* Module 8 Result */
+   sb = 0.29166666666666667;                         /*******************/
+   sc = sa - sb;
+   T[26] = one / T[25];
+						  /*********************/
+						  /*   DO NOT REMOVE   */
+						  /*   THIS PRINTOUT!  */
+						  /*********************/
+   printf("     8   %13.4le  %10.4lf  %10.4lf\n",sc,T[24],T[26]);
+
+/**************************************************/
+/* MFLOPS(1) output. This is the same weighting   */
+/* used for all previous versions of the flops.c  */
+/* program. Includes Modules 2 and 3 only.        */
+/**************************************************/
+   T[27] = ( five * (T[6] - T[5]) + T[9] ) / 52.0;
+   T[28] = one  / T[27];
+
+/**************************************************/
+/* MFLOPS(2) output. This output does not include */
+/* Module 2, but it still does 9.2% FDIV's.       */
+/**************************************************/
+   T[29] = T[2] + T[9] + T[12] + T[15] + T[18];
+   T[29] = (T[29] + four * T[21]) / 152.0;
+   T[30] = one / T[29];
+
+/**************************************************/
+/* MFLOPS(3) output. This output does not include */
+/* Module 2, but it still does 3.4% FDIV's.       */
+/**************************************************/
+   T[31] = T[2] + T[9] + T[12] + T[15] + T[18];
+   T[31] = (T[31] + T[21] + T[24]) / 146.0;
+   T[32] = one / T[31];
+
+/**************************************************/
+/* MFLOPS(4) output. This output does not include */
+/* Module 2, and it does NO FDIV's.               */
+/**************************************************/
+   T[33] = (T[9] + T[12] + T[18] + T[24]) / 91.0;
+   T[34] = one / T[33];
+
+
+   printf("\n");
+   printf("   Iterations      = %10ld\n",m);
+   printf("   NullTime (usec) = %10.4lf\n",nulltime);
+   printf("   MFLOPS(1)       = %10.4lf\n",T[28]);
+   printf("   MFLOPS(2)       = %10.4lf\n",T[30]);
+   printf("   MFLOPS(3)       = %10.4lf\n",T[32]);
+   printf("   MFLOPS(4)       = %10.4lf\n\n",T[34]);
+   return 0;
+}
+
+/*****************************************************/
+/* Various timer routines.                           */
+/* Al Aburto, aburto@nosc.mil, 18 Feb 1997           */
+/*                                                   */
+/* dtime(p) outputs the elapsed time seconds in p[1] */
+/* from a call of dtime(p) to the next call of       */
+/* dtime(p).  Use CAUTION as some of these routines  */
+/* will mess up when timing across the hour mark!!!  */
+/*                                                   */
+/* For timing I use the 'user' time whenever         */
+/* possible. Using 'user+sys' time is a separate     */
+/* issue.                                            */
+/*                                                   */
+/* Example Usage:                                    */
+/* [Timer options added here]                        */
+/* double RunTime, TimeArray[3];                     */
+/* main()                                            */
+/* {                                                 */
+/* dtime(TimeArray);                                 */
+/* [routine to time]                                 */
+/* dtime(TimeArray);                                 */
+/* RunTime = TimeArray[1];                           */
+/* }                                                 */
+/* [Timer code added here]                           */
+/*****************************************************/
+
+/******************************/
+/* Timer code.                */
+/******************************/
+
+/*******************/
+/*  Amiga dtime()  */
+/*******************/
+#ifdef Amiga
+#include <ctype.h>
+#define HZ 50
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ struct tt {
+	long  days;
+	long  minutes;
+	long  ticks;
+ } tt;
+
+ q = p[2];
+
+ DateStamp(&tt);
+
+ p[2] = ( (double)(tt.ticks + (tt.minutes * 60L * 50L)) ) / (double)HZ;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/*****************************************************/
+/*  UNIX dtime(). This is the preferred UNIX timer.  */
+/*  Provided by: Markku Kolkka, mk59200@cc.tut.fi    */
+/*  HP-UX Addition by: Bo Thide', bt@irfu.se         */
+/*****************************************************/
+#ifdef UNIX
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifdef hpux
+#include <sys/syscall.h>
+#define getrusage(a,b) syscall(SYS_getrusage,a,b)
+#endif
+
+struct rusage rusage;
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ getrusage(RUSAGE_SELF,&rusage);
+
+ p[2] = (double)(rusage.ru_utime.tv_sec);
+ p[2] = p[2] + (double)(rusage.ru_utime.tv_usec) * 1.0e-06;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/***************************************************/
+/*  UNIX_Old dtime(). This is the old UNIX timer.  */
+/*  Use only if absolutely necessary as HZ may be  */
+/*  ill defined on your system.                    */
+/***************************************************/
+#ifdef UNIX_Old
+#include <sys/types.h>
+#include <sys/times.h>
+#include <sys/param.h>
+
+#ifndef HZ
+#define HZ 60
+#endif
+
+struct tms tms;
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ times(&tms);
+
+ p[2] = (double)(tms.tms_utime) / (double)HZ;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/*********************************************************/
+/*  VMS dtime() for VMS systems.                         */
+/*  Provided by: RAMO@uvphys.phys.UVic.CA                */
+/*  Some people have run into problems with this timer.  */
+/*********************************************************/
+#ifdef VMS
+#include time
+
+#ifndef HZ
+#define HZ 100
+#endif
+
+struct tbuffer_t
+      {
+       int proc_user_time;
+       int proc_system_time;
+       int child_user_time;
+       int child_system_time;
+      };
+
+struct tbuffer_t tms;
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ times(&tms);
+
+ p[2] = (double)(tms.proc_user_time) / (double)HZ;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/******************************/
+/*  BORLAND C dtime() for DOS */
+/******************************/
+#ifdef BORLAND_C
+#include <ctype.h>
+#include <dos.h>
+#include <time.h>
+
+#define HZ 100
+struct time tnow;
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ gettime(&tnow);
+
+ p[2] = 60.0 * (double)(tnow.ti_min);
+ p[2] = p[2] + (double)(tnow.ti_sec);
+ p[2] = p[2] + (double)(tnow.ti_hund)/(double)HZ;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/**************************************/
+/*  Microsoft C (MSC) dtime() for DOS */
+/**************************************/
+#ifdef MSC
+#include <time.h>
+#include <ctype.h>
+
+#define HZ CLOCKS_PER_SEC
+clock_t tnow;
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ tnow = clock();
+
+ p[2] = (double)tnow / (double)HZ;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/*************************************/
+/*  Macintosh (MAC) Think C dtime()  */
+/*************************************/
+#ifdef MAC
+#include <time.h>
+
+#define HZ 60
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ p[2] = (double)clock() / (double)HZ;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/************************************************************/
+/*  iPSC/860 (IPSC) dtime() for i860.                       */
+/*  Provided by: Dan Yergeau, yergeau@gloworm.Stanford.EDU  */
+/************************************************************/
+#ifdef IPSC
+extern double dclock();
+
+dtime(p)
+double p[];
+{
+  double q;
+
+  q = p[2];
+
+  p[2] = dclock();
+  p[1] = p[2] - q;
+
+  return 0;
+}
+#endif
+
+/**************************************************/
+/*  FORTRAN dtime() for Cray type systems.        */
+/*  This is the preferred timer for Cray systems. */
+/**************************************************/
+#ifdef FORTRAN_SEC
+
+fortran double second();
+
+dtime(p)
+double p[];
+{
+ double q,v;
+
+ q = p[2];
+
+ second(&v);
+ p[2] = v;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/***********************************************************/
+/*  UNICOS C dtime() for Cray UNICOS systems.  Don't use   */
+/*  unless absolutely necessary as returned time includes  */
+/*  'user+system' time.  Provided by: R. Mike Dority,      */
+/*  dority@craysea.cray.com                                */
+/***********************************************************/
+#ifdef CTimer
+#include <time.h>
+
+dtime(p)
+double p[];
+{
+ double    q;
+ clock_t   clock(void);
+
+ q = p[2];
+
+ p[2] = (double)clock() / (double)CLOCKS_PER_SEC;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/********************************************/
+/* Another UNIX timer using gettimeofday(). */
+/* However, getrusage() is preferred.       */
+/********************************************/
+#ifdef GTODay
+#include <sys/time.h>
+
+struct timeval tnow;
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ gettimeofday(&tnow,NULL);
+ p[2] = (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/*****************************************************/
+/*  Fujitsu UXP/M timer.                             */
+/*  Provided by: Mathew Lim, ANUSF, M.Lim@anu.edu.au */
+/*****************************************************/
+#ifdef UXPM
+#include <sys/types.h>
+#include <sys/timesu.h>
+struct tmsu rusage;
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ timesu(&rusage);
+
+ p[2] = (double)(rusage.tms_utime) * 1.0e-06;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/**********************************************/
+/*    Macintosh (MAC_TMgr) Think C dtime()    */
+/*   requires Think C Language Extensions or  */
+/*    #include <MacHeaders> in the prefix     */
+/*  provided by Francis H Schiffer 3rd (fhs)  */
+/*         skipschiffer@genie.geis.com        */
+/**********************************************/
+#ifdef MAC_TMgr
+#include <Time.h>
+#include <stdlib.h>
+
+static TMTask   mgrTimer;
+static Boolean  mgrInited = FALSE;
+static double   mgrClock;
+
+#define RMV_TIMER RmvTime( (QElemPtr)&mgrTimer )
+#define MAX_TIME  1800000000L
+/* MAX_TIME limits time between calls to */
+/* dtime( ) to no more than 30 minutes   */
+/* this limitation could be removed by   */
+/* creating a completion routine to sum  */
+/* 30 minute segments (fhs 1994 feb 9)   */
+
+static void     Remove_timer( )
+{
+ RMV_TIMER;
+ mgrInited = FALSE;
+}
+
+int     dtime( p )
+double p[];
+{
+ if ( mgrInited ) {
+    RMV_TIMER;
+    mgrClock += (MAX_TIME + mgrTimer.tmCount)*1.0e-6;
+    } else {
+    if ( _atexit( &Remove_timer ) == 0 ) mgrInited = TRUE;
+    mgrClock = 0.0;
+   }
+
+ p[1] = mgrClock - p[2];
+ p[2] = mgrClock;
+ if ( mgrInited ) {
+    mgrTimer.tmAddr = NULL;
+    mgrTimer.tmCount = 0;
+    mgrTimer.tmWakeUp = 0;
+    mgrTimer.tmReserved = 0;
+    InsTime( (QElemPtr)&mgrTimer );
+    PrimeTime( (QElemPtr)&mgrTimer, -MAX_TIME );
+   }
+ return( 0 );
+}
+#endif
+
+/***********************************************************/
+/*  Parsytec GCel timer.                                   */
+/*  Provided by: Georg Wambach, gw@informatik.uni-koeln.de */
+/***********************************************************/
+#ifdef PARIX
+#include <sys/time.h>
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+ p[2] = (double) (TimeNowHigh()) / (double) CLK_TCK_HIGH;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/************************************************/
+/*  Sun Solaris POSIX dtime() routine           */
+/*  Provided by: Case Larsen, CTLarsen@lbl.gov  */
+/************************************************/
+#ifdef POSIX
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/rusage.h>
+
+#ifdef __hpux
+#include <sys/syscall.h>
+#define getrusage(a,b) syscall(SYS_getrusage,a,b)
+#endif
+
+struct rusage rusage;
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ getrusage(RUSAGE_SELF,&rusage);
+
+ p[2] = (double)(rusage.ru_utime.tv_sec);
+ p[2] = p[2] + (double)(rusage.ru_utime.tv_nsec) * 1.0e-09;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/****************************************************/
+/*  Windows NT (32 bit) dtime() routine             */
+/*  Provided by: Piers Haken, piersh@microsoft.com  */
+/****************************************************/
+#ifdef WIN32
+#include <windows.h>
+
+dtime(p)
+double p[];
+{
+ double q;
+
+ q = p[2];
+
+ p[2] = (double)GetTickCount() * 1.0e-03;
+ p[1] = p[2] - q;
+
+ return 0;
+}
+#endif
+
+/*****************************************************/
+/* Time according to POSIX.1  -  <J.Pelan@qub.ac.uk> */
+/* Ref: "POSIX Programmer's Guide"  O'Reilly & Assoc.*/
+/*****************************************************/
+#ifdef POSIX1
+#define _POSIX_SOURCE 1
+#include <unistd.h>
+#include <limits.h>
+#include <sys/times.h>
+
+struct tms tms;
+
+dtime(p)
+double p[];
+{
+ double q;
+ times(&tms);
+ q = p[2];
+ p[2] = (double)tms.tms_utime / (double)CLK_TCK;
+ p[1] = p[2] - q;
+ return 0;
+}
+#endif
+
+/*------ End flops.c code, say good night Jan! (Sep 1992) ------*/
diff --git a/Sheet3/stream.c b/Sheet3/stream.c
new file mode 100644
index 0000000..b9a2cee
--- /dev/null
+++ b/Sheet3/stream.c
@@ -0,0 +1,585 @@
+/*-----------------------------------------------------------------------*/
+/* Program: STREAM                                                       */
+/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2013: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*           "tuned STREAM benchmark results"                            */
+/*           "based on a variant of the STREAM benchmark code"           */
+/*         Other comparable, clear, and reasonable labelling is          */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <unistd.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/*-----------------------------------------------------------------------
+ * INSTRUCTIONS:
+ *
+ *	1) STREAM requires different amounts of memory to run on different
+ *           systems, depending on both the system cache size(s) and the
+ *           granularity of the system timer.
+ *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
+ *           to meet *both* of the following criteria:
+ *       (a) Each array must be at least 4 times the size of the
+ *           available cache memory. I don't worry about the difference
+ *           between 10^6 and 2^20, so in practice the minimum array size
+ *           is about 3.8 times the cache size.
+ *           Example 1: One Xeon E3 with 8 MB L3 cache
+ *               STREAM_ARRAY_SIZE should be >= 4 million, giving
+ *               an array size of 30.5 MB and a total memory requirement
+ *               of 91.5 MB.  
+ *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
+ *               STREAM_ARRAY_SIZE should be >= 20 million, giving
+ *               an array size of 153 MB and a total memory requirement
+ *               of 458 MB.  
+ *       (b) The size should be large enough so that the 'timing calibration'
+ *           output by the program is at least 20 clock-ticks.  
+ *           Example: most versions of Windows have a 10 millisecond timer
+ *               granularity.  20 "ticks" at 10 ms/tic is 200 milliseconds.
+ *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
+ *               This means the each array must be at least 1 GB, or 128M elements.
+ *
+ *      Version 5.10 increases the default array size from 2 million
+ *          elements to 10 million elements in response to the increasing
+ *          size of L3 caches.  The new default size is large enough for caches
+ *          up to 20 MB. 
+ *      Version 5.10 changes the loop index variables from "register int"
+ *          to "ssize_t", which allows array indices >2^32 (4 billion)
+ *          on properly configured 64-bit systems.  Additional compiler options
+ *          (such as "-mcmodel=medium") may be required for large memory runs.
+ *
+ *      Array size can be set at compile time without modifying the source
+ *          code for the (many) compilers that support preprocessor definitions
+ *          on the compile line.  E.g.,
+ *                gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
+ *          will override the default size of 10M with a new size of 100M elements
+ *          per array.
+ */
+#ifndef STREAM_ARRAY_SIZE
+#   define STREAM_ARRAY_SIZE	10000000
+#endif
+
+/*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
+ *         for any iteration after the first, therefore the minimum value
+ *         for NTIMES is 2.
+ *      There are no rules on maximum allowable values for NTIMES, but
+ *         values larger than the default are unlikely to noticeably
+ *         increase the reported performance.
+ *      NTIMES can also be set on the compile line without changing the source
+ *         code using, for example, "-DNTIMES=7".
+ */
+#ifdef NTIMES
+#if NTIMES<=1
+#   define NTIMES	10
+#endif
+#endif
+#ifndef NTIMES
+#   define NTIMES	10
+#endif
+
+/*  Users are allowed to modify the "OFFSET" variable, which *may* change the
+ *         relative alignment of the arrays (though compilers may change the 
+ *         effective offset by making the arrays non-contiguous on some systems). 
+ *      Use of non-zero values for OFFSET can be especially helpful if the
+ *         STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
+ *      OFFSET can also be set on the compile line without changing the source
+ *         code using, for example, "-DOFFSET=56".
+ */
+#ifndef OFFSET
+#   define OFFSET	0
+#endif
+
+/*
+ *	3) Compile the code with optimization.  Many compilers generate
+ *       unreasonably bad code before the optimizer tightens things up.  
+ *     If the results are unreasonably good, on the other hand, the
+ *       optimizer might be too smart for me!
+ *
+ *     For a simple single-core version, try compiling with:
+ *            cc -O stream.c -o stream
+ *     This is known to work on many, many systems....
+ *
+ *     To use multiple cores, you need to tell the compiler to obey the OpenMP
+ *       directives in the code.  This varies by compiler, but a common example is
+ *            gcc -O -fopenmp stream.c -o stream_omp
+ *       The environment variable OMP_NUM_THREADS allows runtime control of the 
+ *         number of threads/cores used when the resulting "stream_omp" program
+ *         is executed.
+ *
+ *     To run with single-precision variables and arithmetic, simply add
+ *         -DSTREAM_TYPE=float
+ *     to the compile line.
+ *     Note that this changes the minimum array sizes required --- see (1) above.
+ *
+ *     The preprocessor directive "TUNED" does not do much -- it simply causes the 
+ *       code to call separate functions to execute each kernel.  Trivial versions
+ *       of these functions are provided, but they are *not* tuned -- they just 
+ *       provide predefined interfaces to be replaced with tuned code.
+ *
+ *
+ *	4) Optional: Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include info that will help me understand:
+ *		a) the computer hardware configuration (e.g., processor model, memory type)
+ *		b) the compiler name/version and compilation flags
+ *      c) any run-time information (such as OMP_NUM_THREADS)
+ *		d) all of the output from the test case.
+ *
+ * Thanks!
+ *
+ *-----------------------------------------------------------------------*/
+
+# define HLINE "-------------------------------------------------------------\n"
+
+# ifndef MIN
+# define MIN(x,y) ((x)<(y)?(x):(y))
+# endif
+# ifndef MAX
+# define MAX(x,y) ((x)>(y)?(x):(y))
+# endif
+
+#ifndef STREAM_TYPE
+#define STREAM_TYPE double
+#endif
+
+static STREAM_TYPE	a[STREAM_ARRAY_SIZE+OFFSET],
+			b[STREAM_ARRAY_SIZE+OFFSET],
+			c[STREAM_ARRAY_SIZE+OFFSET];
+
+static double	avgtime[4] = {0}, maxtime[4] = {0},
+		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+
+static char	*label[4] = {"Copy:      ", "Scale:     ",
+    "Add:       ", "Triad:     "};
+
+static double	bytes[4] = {
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE
+    };
+
+extern double mysecond();
+extern void checkSTREAMresults();
+#ifdef TUNED
+extern void tuned_STREAM_Copy();
+extern void tuned_STREAM_Scale(STREAM_TYPE scalar);
+extern void tuned_STREAM_Add();
+extern void tuned_STREAM_Triad(STREAM_TYPE scalar);
+#endif
+#ifdef _OPENMP
+extern int omp_get_num_threads();
+#endif
+int
+main()
+    {
+    int			quantum, checktick();
+    int			BytesPerWord;
+    int			k;
+    ssize_t		j;
+    STREAM_TYPE		scalar;
+    double		t, times[4][NTIMES];
+
+    /* --- SETUP --- determine precision and check timing --- */
+
+    printf(HLINE);
+    printf("STREAM version $Revision: 5.10 $\n");
+    printf(HLINE);
+    BytesPerWord = sizeof(STREAM_TYPE);
+    printf("This system uses %d bytes per array element.\n",
+	BytesPerWord);
+
+    printf(HLINE);
+#ifdef N
+    printf("*****  WARNING: ******\n");
+    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
+    printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
+    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
+    printf("*****  WARNING: ******\n");
+#endif
+
+    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
+    printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
+	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
+	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
+    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
+	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
+	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
+    printf("Each kernel will be executed %d times.\n", NTIMES);
+    printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
+    printf(" will be used to compute the reported bandwidth.\n");
+
+#ifdef _OPENMP
+    printf(HLINE);
+#pragma omp parallel 
+    {
+#pragma omp master
+	{
+	    k = omp_get_num_threads();
+	    printf ("Number of Threads requested = %i\n",k);
+        }
+    }
+#endif
+
+#ifdef _OPENMP
+	k = 0;
+#pragma omp parallel
+#pragma omp atomic 
+		k++;
+    printf ("Number of Threads counted = %i\n",k);
+#endif
+
+    /* Get initial value for system clock. */
+#pragma omp parallel for
+    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+	    a[j] = 1.0;
+	    b[j] = 2.0;
+	    c[j] = 0.0;
+	}
+
+    printf(HLINE);
+
+    if  ( (quantum = checktick()) >= 1) 
+	printf("Your clock granularity/precision appears to be "
+	    "%d microseconds.\n", quantum);
+    else {
+	printf("Your clock granularity appears to be "
+	    "less than one microsecond.\n");
+	quantum = 1;
+    }
+
+    t = mysecond();
+#pragma omp parallel for
+    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
+		a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (mysecond() - t);
+
+    printf("Each test below will take on the order"
+	" of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+
+    printf(HLINE);
+
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+    
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+
+    scalar = 3.0;
+    for (k=0; k<NTIMES; k++)
+	{
+	times[0][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Copy();
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    c[j] = a[j];
+#endif
+	times[0][k] = mysecond() - times[0][k];
+	
+	times[1][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Scale(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    b[j] = scalar*c[j];
+#endif
+	times[1][k] = mysecond() - times[1][k];
+	
+	times[2][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Add();
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    c[j] = a[j]+b[j];
+#endif
+	times[2][k] = mysecond() - times[2][k];
+	
+	times[3][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Triad(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    a[j] = b[j]+scalar*c[j];
+#endif
+	times[3][k] = mysecond() - times[3][k];
+	}
+
+    /*	--- SUMMARY --- */
+
+    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
+	{
+	for (j=0; j<4; j++)
+	    {
+	    avgtime[j] = avgtime[j] + times[j][k];
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	    }
+	}
+    
+    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+		avgtime[j] = avgtime[j]/(double)(NTIMES-1);
+
+		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
+	       1.0E-06 * bytes[j]/mintime[j],
+	       avgtime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    printf(HLINE);
+
+    /* --- Check Results --- */
+    checkSTREAMresults();
+    printf(HLINE);
+
+    return 0;
+}
+
+# define	M	20
+
+int
+checktick()
+    {
+    int		i, minDelta, Delta;
+    double	t1, t2, timesfound[M];
+
+/*  Collect a sequence of M unique time values from the system. */
+
+    for (i = 0; i < M; i++) {
+	t1 = mysecond();
+	while( ((t2=mysecond()) - t1) < 1.0E-6 )
+	    ;
+	timesfound[i] = t1 = t2;
+	}
+
+/*
+ * Determine the minimum difference between these M values.
+ * This result will be our estimate (in microseconds) for the
+ * clock granularity.
+ */
+
+    minDelta = 1000000;
+    for (i = 1; i < M; i++) {
+	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
+	minDelta = MIN(minDelta, MAX(Delta,0));
+	}
+
+   return(minDelta);
+    }
+
+
+
+/* A gettimeofday routine to give access to the wall
+   clock timer on most UNIX-like systems.  */
+
+#include <sys/time.h>
+
+double mysecond()
+{
+        struct timeval tp;
+        struct timezone tzp;
+        int i;
+
+        i = gettimeofday(&tp,&tzp);
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+#ifndef abs
+#define abs(a) ((a) >= 0 ? (a) : -(a))
+#endif
+void checkSTREAMresults ()
+{
+	STREAM_TYPE aj,bj,cj,scalar;
+	STREAM_TYPE aSumErr,bSumErr,cSumErr;
+	STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
+	double epsilon;
+	ssize_t	j;
+	int	k,ierr,err;
+
+    /* reproduce initialization */
+	aj = 1.0;
+	bj = 2.0;
+	cj = 0.0;
+    /* a[] is modified during timing check */
+	aj = 2.0E0 * aj;
+    /* now execute timing loop */
+	scalar = 3.0;
+	for (k=0; k<NTIMES; k++)
+        {
+            cj = aj;
+            bj = scalar*cj;
+            cj = aj+bj;
+            aj = bj+scalar*cj;
+        }
+
+    /* accumulate deltas between observed and expected results */
+	aSumErr = 0.0;
+	bSumErr = 0.0;
+	cSumErr = 0.0;
+	for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+		aSumErr += abs(a[j] - aj);
+		bSumErr += abs(b[j] - bj);
+		cSumErr += abs(c[j] - cj);
+		// if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj);	// MCCALPIN
+	}
+	aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+
+	if (sizeof(STREAM_TYPE) == 4) {
+		epsilon = 1.e-6;
+	}
+	else if (sizeof(STREAM_TYPE) == 8) {
+		epsilon = 1.e-13;
+	}
+	else {
+		printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
+		epsilon = 1.e-6;
+	}
+
+	err = 0;
+	if (abs(aAvgErr/aj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(a[j]/aj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+						j,aj,a[j],abs((aj-a[j])/aAvgErr));
+				}
+#endif
+			}
+		}
+		printf("     For array a[], %d errors were found.\n",ierr);
+	}
+	if (abs(bAvgErr/bj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(b[j]/bj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+						j,bj,b[j],abs((bj-b[j])/bAvgErr));
+				}
+#endif
+			}
+		}
+		printf("     For array b[], %d errors were found.\n",ierr);
+	}
+	if (abs(cAvgErr/cj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(c[j]/cj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+						j,cj,c[j],abs((cj-c[j])/cAvgErr));
+				}
+#endif
+			}
+		}
+		printf("     For array c[], %d errors were found.\n",ierr);
+	}
+	if (err == 0) {
+		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
+	}
+#ifdef VERBOSE
+	printf ("Results Validation Verbose Results: \n");
+	printf ("    Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
+	printf ("    Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
+	printf ("    Rel Errors on a, b, c:     %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
+#endif
+}
+
+#ifdef TUNED
+/* stubs for "tuned" versions of the kernels */
+void tuned_STREAM_Copy()
+{
+	ssize_t j;
+#pragma omp parallel for
+        for (j=0; j<STREAM_ARRAY_SIZE; j++)
+            c[j] = a[j];
+}
+
+void tuned_STREAM_Scale(STREAM_TYPE scalar)
+{
+	ssize_t j;
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    b[j] = scalar*c[j];
+}
+
+void tuned_STREAM_Add()
+{
+	ssize_t j;
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    c[j] = a[j]+b[j];
+}
+
+void tuned_STREAM_Triad(STREAM_TYPE scalar)
+{
+	ssize_t j;
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    a[j] = b[j]+scalar*c[j];
+}
+/* end of stubs for the "tuned" versions of the kernels */
+#endif