copied the templates

2026-01-19 10:45:46 +01:00 · 2026-01-19 10:45:46 +01:00 · 46636593b5
commit 46636593b5
63 changed files with 570970 additions and 0 deletions
--- a/mgrid_2/CMakeLists.txt
+++ b/mgrid_2/CMakeLists.txt
@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 3.21)
+project(mgrid_2)
+
+set(CMAKE_CXX_STANDARD 17)
+add_compile_options(-fopenmp)
+add_link_options(-fopenmp)
+
+#Can manually add the sources using the set command as follows:
+#set(SOURCES src/mainapp.cpp src/Student.cpp)
+
+#However, the file(GLOB...) allows for wildcard additions:
+file(GLOB SOURCES "*.cpp")
+add_executable(mgrid_2  ${SOURCES})
--- a/mgrid_2/Doxyfile
+++ b/mgrid_2/Doxyfile
--- a/mgrid_2/Makefile
+++ b/mgrid_2/Makefile
@ -0,0 +1,58 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# COMPILER=GCC_SEQ_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+MAIN = main
+#MAIN = generateCRS
+SOURCES = ${MAIN}.cpp binaryIO.cpp vdop.cpp geom.cpp\
+	getmatrix.cpp jacsolve.cpp userset.cpp\
+	cuthill_mckee_ordering.cpp
+# 	dexx.cpp debugd.cpp skalar.cpp  vecaccu.cpp accudiag.cpp
+
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= ${MAIN}.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+# -DNDEBUG
+# -pg slows down the code on my laptop when using CLANG_
+LINKFLAGS += -g 
+#-pg
+#CXXFLAGS += -Q --help=optimizers
+#CXXFLAGS += -fopt-info
+
+include ../${COMPILER}default.mk
+
+#############################################################################
+# additional specific cleaning in this directory
+clean_all::
+	@rm -f t.dat*
+
+
+#############################################################################
+# special testing
+# NPROCS	= 4
+#
+TFILE	= t.dat
+# TTMP	= t.tmp
+#
+graph: $(PROGRAM)
+# 	@rm -f $(TFILE).*
+	# next two lines only sequentially
+	./$(PROGRAM)
+	@mv  $(TFILE).000 $(TFILE)
+# 	$(MPIRUN) $(MPIFLAGS) -np $(NPROCS) $(PROGRAM)
+# 	@echo " "; echo "Manipulate data for graphics."; echo " "
+# 	@cat $(TFILE).* > $(TTMP)
+# 	@sort -b -k 2    $(TTMP)   -o $(TTMP).1
+# 	@sort -b -k 1    $(TTMP).1 -o $(TTMP).2
+# 	@awk  -f nl.awk  $(TTMP).2  > $(TFILE)
+# 	@rm -f $(TTMP).* $(TTMP) $(TFILE).*
+#
+	-gnuplot jac.dem
--- a/mgrid_2/README.md
+++ b/mgrid_2/README.md
@ -0,0 +1,24 @@
+# gmgrid: Geometric multigrid on CPU (OpenMP)
+
+### Status on 2025-May-14
+ - The multigrid part is deactivated via `*#undef MG` in *main.cpp* and so only a simple Jacobi iteration is performed as solver.
+ - `make run`
+ - or `main.GCC_ levels`  with number of `levels` in [0,7].
+
+### Data structures/implementation for GPU
+ - use data structures from cg_2:Framework for preconditioned solvers on GPU and CPU
+ - suggested: class vec from vdop_gpu.h
+ - suggested: class CRS_Matrix_GPU from crsmatrix_gpu.h
+ - use cuBLAS, cuSPARSE and more libraries whenever possible.
+ 
+### current code structure
+ - *main.cpp*
+ - *binaryIO.cpp*    : reads CRS matrix and vector from files
+ - *vdop.cpp*        : some basic vector operations on CPU, also in *utils.h*
+ - *geom.cpp*        : reads the coarse geometry, performes mesh handling and includes mesh hierarchy
+ - *getmatrix.cpp*   : compressed row storage matrix and its generation from a 2D mesh
+ - *cuthill_mckee_ordering.cpp* : graph reordering to minimize the bandwidth
+ - *jacsolve.cpp*    : Jacobi solver/smoother for a linear system of equations with CRS matrix; multigrid solver
+ - *elements.cpp*    : more general elements - experimental. Try with *el_main.cpp*
+ - *generateCRS.cpp* : Reads mesh files and generates files output with stiffness matrix (Laplace) and right hand side.
+
--- a/mgrid_2/ascii_read_meshvector.m
+++ b/mgrid_2/ascii_read_meshvector.m
@ -0,0 +1,43 @@
+function [ xc, ia, v ] = ascii_read_meshvector( fname )
+%
+% Loads the 2D triangular mesh (coordinates, vertex connectivity) 
+%  together with values on its vertices from an ASCII file.
+%   Matlab indexing is stored  (starts with 1).
+% 
+% The input file format is compatible 
+%  with Mesh_2d_3_matlab:Write_ascii_matlab(..) in jacobi_oo_stl/geom.h
+%
+%
+%  IN: fname - filename
+% OUT: xc    - coordinates
+%      ia    - mesh connectivity
+%      v     - solution vector
+
+DELIMETER = ' ';
+
+fprintf('Read file  %s\n',fname)
+
+% Read mesh constants
+nn = dlmread(fname,DELIMETER,[0 0 0 3]);  %% row_1, col_1, row_2, col_2  in C indexing!!!
+nnode = nn(1);
+ndim  = nn(2);
+nelem = nn(3);
+nvert = nn(4);
+
+% Read coordinates
+row_start = 0+1;
+row_end   = 0+nnode;
+xc = dlmread(fname,DELIMETER,[row_start 0 row_end ndim-1]);
+
+% Read connectivity
+row_start = row_end+1;
+row_end   = row_end+nelem;
+ia = dlmread(fname,DELIMETER,[row_start 0 row_end nvert-1]);
+
+% Read solution
+row_start = row_end+1;
+row_end   = row_end+nnode;
+v = dlmread(fname,DELIMETER,[row_start 0 row_end 0]);
+end
+
+
--- a/mgrid_2/binaryIO.cpp
+++ b/mgrid_2/binaryIO.cpp
@ -0,0 +1,115 @@
+#include "binaryIO.h"
+#include <cassert>
+
+using namespace std;
+
+void read_binMatrix(const string& file, vector<int> &cnt, vector<int> &col, vector<double> &ele)
+{
+    ifstream ifs(file, ios_base::in | ios_base::binary);
+
+    if(!(ifs.is_open() && ifs.good()))
+    {
+        cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
+        assert(ifs.is_open());
+    }
+    cout << "ReadBinMatrix: Opened file " << file << endl;
+
+    int _size;
+
+    ifs.read(reinterpret_cast<char*>(&_size), sizeof(int));   // old: ifs.read((char*)&_size, sizeof(int));
+    cnt.resize(_size);
+    cout << "ReadBinMatrix: cnt size: " << _size << endl;
+
+    ifs.read(reinterpret_cast<char*>(&_size), sizeof(int));
+    col.resize(_size);
+    cout << "ReadBinMatrix: col size: " << _size << endl;
+
+    ifs.read(reinterpret_cast<char*>(&_size), sizeof(int));
+    ele.resize(_size);
+    cout << "ReadBinMatrix: ele size: " << _size << endl;
+
+
+    ifs.read(reinterpret_cast<char*>(cnt.data()), cnt.size() * sizeof(int));
+    ifs.read(reinterpret_cast<char*>(col.data()), col.size() * sizeof(int));
+    ifs.read(reinterpret_cast<char*>(ele.data()), ele.size() * sizeof(double));
+
+    ifs.close();
+    cout << "ReadBinMatrix: Finished reading matrix.." << endl;
+}
+
+void write_binMatrix(const string& file, const vector<int> &cnt, const vector<int> &col, const vector<double> &ele)
+{
+    ofstream ofs(file, ios_base::out | ios_base::binary);
+
+
+    if(!(ofs.is_open() && ofs.good()))
+    {
+        cerr << "WriteBinMatrix: Error cannot open file " << file << endl;
+        assert(ofs.is_open());
+    }
+    cout << "WriteBinMatrix: Opened file " << file << endl;
+
+    int _size = static_cast<int>( cnt.size() );
+    cout << "WriteBinMatrix: cnt size: " << _size << endl;
+    ofs.write(reinterpret_cast<char*>(&_size), sizeof(int));
+    _size = static_cast<int>( col.size() );
+    cout << "WriteBinMatrix: col size: " << _size << endl;
+    ofs.write(reinterpret_cast<char*>(&_size), sizeof(int));
+    _size = static_cast<int>( ele.size() );
+    cout << "WriteBinMatrix: ele size: " << _size << endl;
+    ofs.write(reinterpret_cast<char*>(&_size), sizeof(int));
+
+    ofs.write(reinterpret_cast<char const *>(cnt.data()), cnt.size() * sizeof(int));
+    ofs.write(reinterpret_cast<char const *>(col.data()), col.size() * sizeof(int));
+    ofs.write(reinterpret_cast<char const *>(ele.data()), ele.size() * sizeof(double));
+
+    ofs.close();
+    cout << "WriteBinMatrix: Finished writing matrix.." << endl;
+}
+
+void read_binVector(const string& file, vector<double> &vec)
+{
+    ifstream ifs(file, ios_base::in | ios_base::binary);
+
+    if(!(ifs.is_open() && ifs.good()))
+    {
+        cerr << "ReadBinVector: Error cannot open file " << file << endl;
+        assert(ifs.is_open());
+    }
+    cout << "ReadBinVector: Opened file " << file << endl;
+
+    int _size;
+
+    ifs.read(reinterpret_cast<char*>(&_size), sizeof(int));
+    vec.resize(_size);
+    cout << "ReadBinVector: cnt size: " << _size << endl;
+
+    ifs.read(reinterpret_cast<char*>(vec.data()), _size * sizeof(double));
+
+    ifs.close();
+    cout << "ReadBinMatrix: Finished reading matrix.." << endl;	
+}
+
+void write_binVector(const string& file, const vector<double> &vec)
+{
+    ofstream ofs(file, ios_base::out | ios_base::binary);
+
+
+    if(!(ofs.is_open() && ofs.good()))
+    {
+        cerr << "WriteBinVector: Error cannot open file " << file << endl;
+        assert(ofs.is_open());
+    }
+    cout << "WriteBinVector: Opened file " << file << endl;
+
+    int _size = static_cast<int>( vec.size() );
+    cout << "WriteBinVector: size: " << _size << endl;
+    ofs.write(reinterpret_cast<char*>(&_size), sizeof(int));
+   
+    ofs.write(reinterpret_cast<char const *>(vec.data()), vec.size() * sizeof(double));
+
+    ofs.close();
+    cout << "WriteBinVector: Finished writing matrix.." << endl;
+}
+
+
--- a/mgrid_2/binaryIO.h
+++ b/mgrid_2/binaryIO.h
@ -0,0 +1,101 @@
+#pragma once
+
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <vector>
+
+/*
+  Matrices are in CRS format: The "dsp" (displacement) vector stores the starting index of each row. So for example
+  the third row starts at dsp[2] and ends at (dsp[3] - 1). The "col" vector stores the column indices of each non-zero
+  matrix entry. The "ele" vector stores the value af each non-zero matrix entry.
+  The redundant "cnt" vector stores the number of elements per row, and we have the relations:
+  cnt[k] := dsp[k+1]-dsp[k], resp.  dsp[k+1]:=cumsum(cnt[0:k]) with dsp[0]:=0
+*/
+
+
+//! \brief A sparse matrix in CRS format (counter, column index, value) is read from a binary file.
+//!
+//!        The binary file has to store 4 Byte integers and 8 Byte doubles and contains the following data:
+//!        - (int*4) Number of rows
+//!        - (int*4) Number of non-zero elements/blocks
+//!        - (int*4) Number of non-zero matrix elements (= previous number * dofs per block)
+//!        - (int*4) [#elements per row]
+//!        - (int*4) [column indices]
+//!        - (real*8)[matrix elements]
+//!
+//! \param[in]   file name of binary file
+//! \param[out]  cnt  number of non-zero elements per row
+//! \param[out]  col  column indices of non-zero elements,  C-Style
+//! \param[out]  ele  non-zero elements of matrix
+//!
+//!
+
+void read_binMatrix(const std::string& file, std::vector<int> &cnt, std::vector<int> &col, std::vector<double> &ele);
+
+//! \brief A sparse matrix in CRS format (counter, column index, value) is written to a binary file.
+//!        The memory is allocated dynamically.
+//!
+//!        The binary file has to store 4 Byte integers and 8 Byte doubles and contains the following data:
+//!        - (int*4) Number of rows
+//!        - (int*4) Number of non-zero elements/blocks
+//!        - (int*4) Number of non-zero matrix elements (= previous number * dofs per block)
+//!        - (int*4) [#elements per row]
+//!        - (int*4) [column indices]
+//!        - (real*8)[matrix elements]
+//!
+//! \param[in]  file name of binary file
+//! \param[in]  cnt  number of non-zero elements per row
+//! \param[in]  col  column indices of non-zero elements,  C-Style
+//! \param[in]  ele  non-zero elements of matrix
+//!
+//!
+void write_binMatrix(const std::string& file, const std::vector<int> &cnt, const std::vector<int> &col, const std::vector<double> &ele);
+
+//! \brief A double vector is read from a binary file.
+//!
+//!        The binary file has to store 4 Byte integers and 8 Byte doubles and contains the following data:
+//!        - (int*4) Number of vector elements
+//!        - (real*8)[vector elements]
+//!
+//! \param[in]  file name of binary file
+//! \param[out] vec  vector
+//!
+void read_binVector(const std::string& file, std::vector<double> &vec);
+
+//! \brief A double vector is written to a binary file. 
+//!        The memory is allocated dynamically.
+//!
+//!        The binary file has to store 4 Byte integers and 8 Byte doubles and contains the following data:
+//!        - (int*4) Number of vector elements
+//!        - (real*8)[vector elements]
+//!
+//! \param[in]  file name of binary file
+//! \param[in]  vec  vector
+//!
+void write_binVector(const std::string& file, const std::vector<double> &vec);
+
+/*
+//! \brief Output of an STL vector.
+//!
+//! \param[in,out] s    output stream
+//! \param[in]     rhs  STL vector
+//! \tparam        T    type of vector elements
+//!
+//! \return modified stream
+//!
+//!
+template <class T>
+std::ostream& operator<<(std::ostream &s, std::vector<T> const & rhs)
+{
+    copy (rhs.cbegin(), rhs.cend(), std::ostream_iterator<T>(s, " "));
+    return s;
+}
+*/
+
+// See item 2 in  http://codeforces.com/blog/entry/15643
+//! \brief Macro that prints a variable name together with its contest
+//!
+//! \param[in]     x  a variable
+#define what_is(x) cerr << #x << " is \n" << (x) << endl;
--- a/mgrid_2/cuthill_mckee_ordering.cpp
+++ b/mgrid_2/cuthill_mckee_ordering.cpp
@ -0,0 +1,218 @@
+//=======================================================================
+// Copyright 1997, 1998, 1999, 2000 University of Notre Dame.
+// Authors: Andrew Lumsdaine, Lie-Quan Lee, Jeremy G. Siek
+//
+// This file is part of the Boost Graph Library
+//
+// You should have received a copy of the License Agreement for the
+// Boost Graph Library along with the software; see the file LICENSE.
+// If not, contact Office of Research, University of Notre Dame, Notre
+// Dame, IN 46556.
+//
+// Permission to modify the code and to distribute modified code is
+// granted, provided the text of this NOTICE is retained, a notice that
+// the code was modified is included with the above COPYRIGHT NOTICE and
+// with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE
+// file is distributed with the modified code.
+//
+// LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.
+// By way of example, but not limitation, Licensor MAKES NO
+// REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
+// PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS
+// OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS
+// OR OTHER RIGHTS.
+
+// Modyfied 2019 by Gundolf Haase, University of Graz
+//=======================================================================
+#include "cuthill_mckee_ordering.h"
+
+#include <boost/config.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/bandwidth.hpp>
+#include <boost/graph/cuthill_mckee_ordering.hpp>
+#include <boost/graph/properties.hpp>
+#include <iostream>
+#include <vector>
+
+/*
+  Sample Output
+  original bandwidth: 8
+  Reverse Cuthill-McKee ordering starting at: 6
+    8 3 0 9 2 5 1 4 7 6
+    bandwidth: 4
+  Reverse Cuthill-McKee ordering starting at: 0
+    9 1 4 6 7 2 8 5 3 0
+    bandwidth: 4
+  Reverse Cuthill-McKee ordering:
+    0 8 5 7 3 6 4 2 1 9
+    bandwidth: 4
+ */
+ /*
+int main(int, char *[])
+{
+    using namespace boost;
+    using namespace std;
+    typedef adjacency_list<vecS, vecS, undirectedS,
+            property<vertex_color_t, default_color_type,
+            property<vertex_degree_t, int> > > Graph;
+    typedef graph_traits<Graph>::vertex_descriptor Vertex;
+    typedef graph_traits<Graph>::vertices_size_type size_type;
+
+    typedef std::pair<std::size_t, std::size_t> Pair;
+    Pair edges[14] = { Pair(0, 3), //a-d
+                       Pair(0, 5), //a-f
+                       Pair(1, 2), //b-c
+                       Pair(1, 4), //b-e
+                       Pair(1, 6), //b-g
+                       Pair(1, 9), //b-j
+                       Pair(2, 3), //c-d
+                       Pair(2, 4), //c-e
+                       Pair(3, 5), //d-f
+                       Pair(3, 8), //d-i
+                       Pair(4, 6), //e-g
+                       Pair(5, 6), //f-g
+                       Pair(5, 7), //f-h
+                       Pair(6, 7)
+                     }; //g-h
+
+    Graph G(10);
+    for (int i = 0; i < 14; ++i)
+        add_edge(edges[i].first, edges[i].second, G);
+
+    graph_traits<Graph>::vertex_iterator ui, ui_end;
+
+    property_map<Graph, vertex_degree_t>::type deg = get(vertex_degree, G);
+    for (boost::tie(ui, ui_end) = vertices(G); ui != ui_end; ++ui)
+        deg[*ui] = degree(*ui, G);
+
+    property_map<Graph, vertex_index_t>::type
+    index_map = get(vertex_index, G);
+
+    std::cout << "original bandwidth: " << bandwidth(G) << std::endl;
+
+    std::vector<Vertex> inv_perm(num_vertices(G));
+    std::vector<size_type> perm(num_vertices(G));
+    {
+        Vertex s = vertex(6, G);
+        //reverse cuthill_mckee_ordering
+        cuthill_mckee_ordering(G, s, inv_perm.rbegin(), get(vertex_color, G),
+                               get(vertex_degree, G));
+        cout << "Reverse Cuthill-McKee ordering starting at: " << s << endl;
+        cout << "  ";
+        for (std::vector<Vertex>::const_iterator i = inv_perm.begin();
+                i != inv_perm.end(); ++i)
+            cout << index_map[*i] << " ";
+        cout << endl;
+
+        for (size_type c = 0; c != inv_perm.size(); ++c)
+            perm[index_map[inv_perm[c]]] = c;
+        std::cout << "  bandwidth: "
+                  << bandwidth(G, make_iterator_property_map(&perm[0], index_map, perm[0]))
+                  << std::endl;
+    }
+    {
+        Vertex s = vertex(0, G);
+        //reverse cuthill_mckee_ordering
+        cuthill_mckee_ordering(G, s, inv_perm.rbegin(), get(vertex_color, G),
+                               get(vertex_degree, G));
+        cout << "Reverse Cuthill-McKee ordering starting at: " << s << endl;
+        cout << "  ";
+        for (std::vector<Vertex>::const_iterator i = inv_perm.begin();
+                i != inv_perm.end(); ++i)
+            cout << index_map[*i] << " ";
+        cout << endl;
+
+        for (size_type c = 0; c != inv_perm.size(); ++c)
+            perm[index_map[inv_perm[c]]] = c;
+        std::cout << "  bandwidth: "
+                  << bandwidth(G, make_iterator_property_map(&perm[0], index_map, perm[0]))
+                  << std::endl;
+    }
+
+    {
+        //reverse cuthill_mckee_ordering
+        cuthill_mckee_ordering(G, inv_perm.rbegin(), get(vertex_color, G),
+                               make_degree_map(G));
+
+        cout << "Reverse Cuthill-McKee ordering:" << endl;
+        cout << "  ";
+        for (std::vector<Vertex>::const_iterator i = inv_perm.begin();
+                i != inv_perm.end(); ++i)
+            cout << index_map[*i] << " ";
+        cout << endl;
+
+        for (size_type c = 0; c != inv_perm.size(); ++c)
+            perm[index_map[inv_perm[c]]] = c;
+        std::cout << "  bandwidth: "
+                  << bandwidth(G, make_iterator_property_map(&perm[0], index_map, perm[0]))
+                  << std::endl;
+    }
+    return 0;
+}
+*/
+
+
+// -------------  Modifications by Gundolf Haase
+//     std::vector<int> _edges; //!< edges of mesh (vertices ordered ascending)
+
+using namespace boost;
+using namespace std;
+typedef adjacency_list<vecS, vecS, undirectedS,
+        property<vertex_color_t, default_color_type,
+        property<vertex_degree_t, int> > > Graph;
+typedef graph_traits<Graph>::vertex_descriptor Vertex;
+typedef graph_traits<Graph>::vertices_size_type size_type;
+
+typedef std::pair<std::size_t, std::size_t> Pair;
+
+vector<int> cuthill_mckee_reordering(vector<int> const &_edges)
+{
+    size_t const nnodes = *max_element(cbegin(_edges), cend(_edges)) + 1;
+    cout << "NNODES = " << nnodes << endl;
+    //size_t const nedges = _edges.size()/2;
+
+    Graph G(nnodes);
+    for (size_t i = 0; i < _edges.size(); i+=2)
+        add_edge(_edges[i], _edges[i+1], G);
+
+    graph_traits<Graph>::vertex_iterator ui, ui_end;
+
+    property_map<Graph, vertex_degree_t>::type deg = get(vertex_degree, G);
+    for (boost::tie(ui, ui_end) = vertices(G); ui != ui_end; ++ui)
+        deg[*ui] = degree(*ui, G);
+
+    property_map<Graph, vertex_index_t>::type
+    index_map = get(vertex_index, G);
+
+    std::cout << "original bandwidth: " << bandwidth(G) << std::endl;
+
+    std::vector<Vertex> inv_perm(num_vertices(G));
+    //std::vector<size_type> perm(num_vertices(G));
+    std::vector<int> perm(num_vertices(G));
+
+    {
+        Vertex s = vertex(nnodes/2, G);
+        //reverse cuthill_mckee_ordering
+        cuthill_mckee_ordering(G, s, inv_perm.rbegin(), get(vertex_color, G),
+                               get(vertex_degree, G));
+        //cout << "Reverse Cuthill-McKee ordering starting at: " << s << endl;
+        //cout << "  ";
+        //for (std::vector<Vertex>::const_iterator i = inv_perm.begin(); i != inv_perm.end(); ++i)
+            //cout << index_map[*i] << " ";
+        //cout << endl;
+
+        for (size_type c = 0; c != inv_perm.size(); ++c)
+            perm[index_map[inv_perm[c]]] = c;
+        std::cout << "improved bandwidth: "
+                  << bandwidth(G, make_iterator_property_map(&perm[0], index_map, perm[0]))
+                  << std::endl;
+    }
+
+    assert(perm.size()==nnodes);
+
+    return perm;
+}
+
+// -------------  end Modifications
+
+
--- a/mgrid_2/cuthill_mckee_ordering.h
+++ b/mgrid_2/cuthill_mckee_ordering.h
@ -0,0 +1,7 @@
+#pragma once
+
+#include <vector>
+
+//     std::vector<int> _edges [nedges][2]; //!< edges of mesh (vertices ordered ascending)
+std::vector<int> cuthill_mckee_reordering(std::vector<int> const &_edges);
+
--- a/mgrid_2/el_Makefile
+++ b/mgrid_2/el_Makefile
@ -0,0 +1,57 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# COMPILER=GCC_SEQ_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+MAIN = el_main
+SOURCES = ${MAIN}.cpp elements.cpp binaryIO.cpp vdop.cpp geom.cpp\
+	getmatrix.cpp jacsolve.cpp userset.cpp\
+	cuthill_mckee_ordering.cpp
+# 	dexx.cpp debugd.cpp skalar.cpp  vecaccu.cpp accudiag.cpp
+
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= ${MAIN}.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+# -DNDEBUG
+# -pg slows down the code on my laptop when using CLANG_
+LINKFLAGS += -g 
+#-pg
+#CXXFLAGS += -Q --help=optimizers
+#CXXFLAGS += -fopt-info
+
+include ../${COMPILER}default.mk
+
+#############################################################################
+# additional specific cleaning in this directory
+clean_all::
+	@rm -f t.dat*
+
+
+#############################################################################
+# special testing
+# NPROCS	= 4
+#
+TFILE	= t.dat
+# TTMP	= t.tmp
+#
+graph: $(PROGRAM)
+# 	@rm -f $(TFILE).*
+	# next two lines only sequentially
+	./$(PROGRAM)
+	@mv  $(TFILE).000 $(TFILE)
+# 	$(MPIRUN) $(MPIFLAGS) -np $(NPROCS) $(PROGRAM)
+# 	@echo " "; echo "Manipulate data for graphics."; echo " "
+# 	@cat $(TFILE).* > $(TTMP)
+# 	@sort -b -k 2    $(TTMP)   -o $(TTMP).1
+# 	@sort -b -k 1    $(TTMP).1 -o $(TTMP).2
+# 	@awk  -f nl.awk  $(TTMP).2  > $(TFILE)
+# 	@rm -f $(TTMP).* $(TTMP) $(TFILE).*
+#
+	-gnuplot jac.dem
--- a/mgrid_2/el_main.cpp
+++ b/mgrid_2/el_main.cpp
@ -0,0 +1,98 @@
+//		MPI code in C++.
+//		See [Gropp/Lusk/Skjellum, "Using MPI", p.33/41 etc.]
+//		and  /opt/mpich/include/mpi2c++/comm.h  for details
+
+#include "elements.h"
+#include "geom.h"
+#include "getmatrix.h"
+#include "jacsolve.h"
+#include "userset.h"
+#include "vdop.h"
+
+#include <cassert>
+#include <chrono>           // timing
+#include <cmath>
+#include <iostream>
+#include <omp.h>
+using namespace std;
+using namespace std::chrono;  // timing
+
+int main(int /* argc */, char ** /* argv */ )
+{
+//#define PDE_2D
+#ifdef PDE_2D
+    //Mesh const mesh("square_100.txt");
+    Mesh const mesh("square_tiny.txt");
+    //mesh.Debug();
+    //mesh.DebugEdgeBased();
+    //constexpr P1_2d_1dof elem;
+    //const P1_2d_1dof elem;
+
+    const P1_2d elem;
+    FEM_Matrix_2 SK(mesh, elem);           // CRS matrix
+
+    vector<double> uv(SK.Nrows(), 0.0);    // temperature
+    vector<double> fv(SK.Nrows(), 0.0);    // r.h.s.        
+    //SK.CalculateLaplace(fv);
+    SK.CalculateLaplace(fv, rhs_lap2);
+    //SK.CheckMatrix();
+    //SK.Debug();
+    //mesh.SetValues(uv, [](double x, double y) -> double {
+    //return x * x * std::sin(2.5 * M_PI * y);
+    //} );
+    mesh.SetValues(uv, sol_lap2);
+
+    SK.ApplyDirichletBC(uv, fv); // Dirichlet nodes are defines in input file
+
+    //////SK.writeBinary("sparseMatrix.bin");
+    //SK.Debug();
+#else
+    //Mesh mesh("NS_3D.txt");
+    Mesh mesh("GH_NS_3D.txt");
+    FEM_Matrix_2 SK(mesh, P1_3d());                  // CRS matrix
+    //mesh.liftToQuadratic();
+    ////FEM_Matrix_2 SK(mesh,P2_3d(3));                 // CRS matrix
+    //FEM_Matrix_2 SK(mesh,P1_2vec_3d());               // CRS matrix
+
+    vector<double> uv(SK.Nrows(), 0.0);    // temperature
+    vector<double> fv(SK.Nrows(), 0.0);    // r.h.s.
+    //SK.CalculateLaplace(fv);
+    SK.CalculateLaplace(fv,rhs_lap3);
+    //SK.CheckMatrix();
+    //mesh.SetValues(uv, [](double x, double y, double z) -> double {
+    //return x * x * std::sin(2.5 * M_PI * y) + z*z;
+    //} );
+    mesh.SetValues(uv, sol_lap3);
+
+    SK.ApplyDirichletBC_Box(uv, fv, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); // brute force Dirichlet via box
+    //SK.ApplyDirichletBC_Box(uv,fv,5.0,6.0,5.0,6.0,0.0,0.0); // z==0 ==> Dirichlet-BC
+#endif
+
+
+    //SK.Debug();
+    auto exact_sol(uv);
+
+    ////SK.Mult(fv,uv);
+
+    //auto t3 = system_clock::now(); // start timer
+    JacobiSolve(SK, fv, uv );          // solve the system of equations
+    //auto t4 = system_clock::now();  // stop timer
+    //auto duration = duration_cast<microseconds>(t4 - t3);        // duration in microseconds
+    //double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    //cout << "JacobiSolve: timing in sec. : " << t_diff << endl;
+
+    auto [val, idx] = findLargestAbsError(exact_sol, uv, 1e-6, 10);
+    cout << val << " :idx: " << idx << endl;
+
+    //cout << "exact: " << exact_sol << endl;
+    //cout << "numer: " << uv << endl;
+
+    ////mesh.Visualize(getAbsError(exact_sol, uv));
+
+
+    ////mesh.Write_ascii_matlab("uv.txt", uv);
+    mesh.Visualize(uv);
+
+    return 0;
+}
+
--- a/mgrid_2/elements.cpp
+++ b/mgrid_2/elements.cpp
@ -0,0 +1,233 @@
+#include "elements.h"
+#include "geom.h"
+#include "getmatrix.h"
+#include "userset.h"
+#include "utils.h"
+#include "vdop.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <vector>
+using namespace std;
+
+vector<int> shrinkColumns(vector<int> const &v2, int col2, int col1)
+{
+    assert( v2.size()%col2 == 0 );
+    assert( col1<=col2 );
+    int const nrows = static_cast<int>(v2.size()/col2);
+    vector<int> v1(nrows*col1);
+    
+    for (int k=0; k<nrows; ++k)
+    {
+        for (int d=0; d<col1; ++d)
+        {
+            v1.at(k*col1+d) = v2.at(k*col2+d);
+        }
+    }
+    
+    return v1;
+}
+
+vector<int> mergeColumns(vector<int> const &v1, int col1, vector<int> const &v2, int col2)
+{
+    assert( v2.size()%col2 == 0 );
+    assert( v1.size()%col1 == 0 );
+    assert( v1.size()/col1 == v2.size()/col2 );
+    int const nrows = static_cast<int>(v2.size()/col2);
+    int const ncols = col1+col2;
+    vector<int> vm(nrows*ncols);
+    
+    
+    for (int k=0; k<nrows; ++k)
+    {
+        for (int d=0; d<col1; ++d)
+        {
+            vm.at(k*ncols+d) = v1.at(k*col1+d);       // copy data v1
+        }
+        for (int d=0; d<col2; ++d)
+        {
+            vm.at(k*ncols+d+col1) = v2.at(k*col2+d);  // copy data v2
+        }
+    }
+    
+    return vm;
+}
+
+
+
+vector<int> indexVectorBlowUp(vector<int> const &vin, int ndof_v, int offset)
+{
+    vector<int> wout(vin.size()*ndof_v);
+    
+    for (size_t k=0; k<vin.size(); ++k)
+    {
+        int const kv=k*ndof_v;
+        for (int d=0; d<ndof_v; ++d)
+        {
+            wout[kv+d] = ndof_v*vin[k]+offset+d;
+        }
+    }
+    
+    return wout;
+}
+
+//######################################################################
+
+Element::~Element() {}
+
+vector<int> P1_2d::getGlobalDOFIndices(vector<int> const &ia_geom)  const
+{
+    assert(0 == nDOFs_loc() % nVertices_loc());
+    int const ndof_v = nDOFs_loc()/nVertices_loc();  // DOFs per vertex
+    return indexVectorBlowUp(ia_geom,ndof_v,0);
+}
+
+vector<int> P1_3d::getGlobalDOFIndices(vector<int> const &ia_geom)  const
+{
+    assert(0 == nDOFs_loc() % nVertices_loc());
+    int const ndof_v = nDOFs_loc()/nVertices_loc();  // DOFs per vertex
+    return indexVectorBlowUp(ia_geom,ndof_v,0);
+}
+
+vector<int> P2_3d::getGlobalDOFIndices(vector<int> const &ia_geom)  const 
+{
+    assert(0 == nDOFs_loc() % nVertices_loc());
+    int const ndof_v = nDOFs_loc()/nVertices_loc();  // DOFs per vertex
+    //cout << "LLLLLLLLLLL   " << ndof_v  << endl;
+    return indexVectorBlowUp(ia_geom,ndof_v,0);    
+}
+
+vector<int> P1_2vec_3d::getGlobalDOFIndices(vector<int> const &ia_geom_P2)  const
+{
+    int const nv1(4);              // #vertices tet linear
+    int const nv2(10);             // #vertices tet quadratic    
+    // first: P1 part - scalar
+    vector<int> ia_geom_P1 = shrinkColumns(ia_geom_P2, nv2, nv1);
+    auto const ia_p1(indexVectorBlowUp(ia_geom_P1,1,0));
+    int const nP1_dofs = *max_element(cbegin(ia_p1),cend(ia_p1));  // offset for next numbering
+    
+    // second: P2 part - vector
+    auto const ia_p2(indexVectorBlowUp(ia_geom_P2,3,nP1_dofs));
+    
+    // merging of  ia_p1 and ia_p2
+    int const col1=nv1*1;
+    int const col2=nv2*3;
+    vector<int> ia_dofs(mergeColumns(ia_p1, col1, ia_p2, col2));
+    
+    return ia_dofs;
+}
+
+////######################################################################
+
+//  general routine for lin. triangular elements
+void P1_2d::CalcLaplace(
+    int const ial[3], double const xc[], 
+    vector<vector<double>> &ske, vector<double> &fe,
+    const function<double(double, double)> &f_func  ) const
+{
+    assert(nVertices_loc()==3);
+    assert(nDOFs_loc()==3);
+    //cout << ske.size() << "  " << nDOFs_loc() << endl;
+    assert(static_cast<int>(ske.size())==nDOFs_loc());
+    const int  i1 = 2*ial[0], i2 = 2*ial[1], i3 = 2*ial[2];
+    const double x13 = xc[i3 + 0] - xc[i1 + 0],  y13 = xc[i3 + 1] - xc[i1 + 1],
+                 x21 = xc[i1 + 0] - xc[i2 + 0],  y21 = xc[i1 + 1] - xc[i2 + 1],
+                 x32 = xc[i2 + 0] - xc[i3 + 0],  y32 = xc[i2 + 1] - xc[i3 + 1];
+    const double jac = std::abs(x21 * y13 - x13 * y21);
+
+    ske[0][0] = 0.5 / jac * (y32 * y32 + x32 * x32);
+    ske[0][1] = 0.5 / jac * (y13 * y32 + x13 * x32);
+    ske[0][2] = 0.5 / jac * (y21 * y32 + x21 * x32);
+    ske[1][0] = ske[0][1];
+    ske[1][1] = 0.5 / jac * (y13 * y13 + x13 * x13);
+    ske[1][2] = 0.5 / jac * (y21 * y13 + x21 * x13);
+    ske[2][0] = ske[0][2];
+    ske[2][1] = ske[1][2];
+    ske[2][2] = 0.5 / jac * (y21 * y21 + x21 * x21);
+
+    const double xm    = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0,
+                 ym    = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0;
+    fe[0] = fe[1] = fe[2] = 0.5 * jac * f_func(xm, ym) / 3.0;
+}
+
+void P1_2d::CalcLaplace(
+    int const ial[3], double const xc[], 
+    vector<vector<double>> &ske, vector<double> &fe) const
+{
+    CalcLaplace(ial,xc,ske,fe,rhs_lap2);
+}
+
+void P1_3d::CalcLaplace(
+    int const ial[4], double const xc[], 
+    vector<vector<double>> &ske, vector<double> &fe,
+    const function<double(double, double, double)> &f_func) const
+{
+    assert(nVertices_loc()==4);
+    assert(nDOFs_loc()==4);
+    assert(static_cast<int>(ske.size())==nDOFs_loc());
+    assert(3==nDim_loc());
+    // global (geom.) vertex indices
+    const int  i1 = 3*ial[0], i2 = 3*ial[1], i3 = 3*ial[2], i4 = 3*ial[3]; 
+    // coordinates of geom. vertices
+    const double x1{xc[i1+0]}, x2{xc[i2+0]}, x3{xc[i3+0]}, x4{xc[i4+0]};
+    const double y1{xc[i1+1]}, y2{xc[i2+1]}, y3{xc[i3+1]}, y4{xc[i4+1]};
+    const double z1{xc[i1+2]}, z2{xc[i2+2]}, z3{xc[i3+2]}, z4{xc[i4+2]};
+    //  center of gravity of the tetrahedron
+    const double xm{(x1+x2+x3+x4)/4}, ym{(y1+y2+y3+y4)/4}, zm{(z1+z2+z3+z4)/4};
+    
+    // copy-paste from Matlab file tet_elem.m
+    const double detA=x1*y3*z2 - x1*y2*z3 + x2*y1*z3 - x2*y3*z1 - x3*y1*z2 + x3*y2*z1 + x1*y2*z4 - x1*y4*z2 - x2*y1*z4 + x2*y4*z1 + x4*y1*z2 - x4*y2*z1 - x1*y3*z4 + x1*y4*z3 + x3*y1*z4 - x3*y4*z1 - x4*y1*z3 + x4*y3*z1 + x2*y3*z4 - x2*y4*z3 - x3*y2*z4 + x3*y4*z2 + x4*y2*z3 - x4*y3*z2;
+
+    const double gradPhi[4][3] = { //       1/detA*
+        {y3*z2 - y2*z3 + y2*z4 - y4*z2 - y3*z4 + y4*z3, x2*z3 - x3*z2 - x2*z4 + x4*z2 + x3*z4 - x4*z3, x3*y2 - x2*y3 + x2*y4 - x4*y2 - x3*y4 + x4*y3},
+        {y1*z3 - y3*z1 - y1*z4 + y4*z1 + y3*z4 - y4*z3, x3*z1 - x1*z3 + x1*z4 - x4*z1 - x3*z4 + x4*z3, x1*y3 - x3*y1 - x1*y4 + x4*y1 + x3*y4 - x4*y3},
+        {y2*z1 - y1*z2 + y1*z4 - y4*z1 - y2*z4 + y4*z2, x1*z2 - x2*z1 - x1*z4 + x4*z1 + x2*z4 - x4*z2, x2*y1 - x1*y2 + x1*y4 - x4*y1 - x2*y4 + x4*y2},
+        {y1*z2 - y2*z1 - y1*z3 + y3*z1 + y2*z3 - y3*z2, x2*z1 - x1*z2 + x1*z3 - x3*z1 - x2*z3 + x3*z2, x1*y2 - x2*y1 - x1*y3 + x3*y1 + x2*y3 - x3*y2},
+    }; 
+    
+    for (int row=0; row<nVertices_loc(); ++row)
+    {
+        for (int col=0; col<nVertices_loc(); ++col)
+        {
+            ske[row][col] = gradPhi[row][0]*gradPhi[col][0]+gradPhi[row][1]*gradPhi[col][1]+gradPhi[row][2]*gradPhi[col][2];
+            ske[row][col] /= (detA*6.0);        // GH: Parantheses are needed
+        }
+        fe[row] = detA/6.0/4.0*f_func(xm, ym, zm);
+    }
+}
+
+
+void P1_3d::CalcLaplace(
+    int const ial[4], double const xc[], 
+    vector<vector<double>> &ske, vector<double> &fe) const
+{
+    CalcLaplace(ial,xc,ske,fe,rhs_lap3);
+}
+
+
+
+void P1_2vec_3d::CalcLaplace(
+    int const ial[10], double const xc[], 
+    vector<vector<double>> &ske, vector<double> &fe,
+    const function<double(double, double, double)> &f_func ) const     
+{
+    cout << "P1_2vec_3d::CalcLaplace\n";
+    assert(nVertices_loc()==10);
+    assert(nDOFs_loc()==34);
+    //cout << ske.size() << "  " << nDOFs_loc() << endl;
+    assert(static_cast<int>(ske.size())==nDOFs_loc());
+    
+    // Dummy filling for first test
+    for (size_t row=0; row<ske.size(); ++row)
+    {
+        fill(ske[row].begin(),ske[row].end(),-1);
+        ske[row][row] = ske.size()+0.5;
+        fe[row] = 1.0/ske.size();
+    }
+    //   Here we have to fill ske, fe
+
+}
+
+
+
--- a/mgrid_2/elements.h
+++ b/mgrid_2/elements.h
@ -0,0 +1,891 @@
+#pragma once
+
+#include "geom.h"
+#include "getmatrix.h"
+#include "userset.h"
+#include "utils.h"
+#include "vdop.h"
+
+#include <array>
+//#include <cassert>
+#include <type_traits>
+#include <variant>   // class variant
+#include <vector>
+
+//!< linear test functions for P1_3D element
+static
+std::array<std::function<double(double,double,double)>,4> const 
+ phi_P1_3d{
+          [](double xi_0,double xi_1,double xi_2) {return 1-xi_0-xi_1-xi_2;},
+          [](double xi_0,double,double) {return xi_0;},
+          [](double,double xi_1,double) {return xi_1;},
+          [](double,double,double xi_2) {return xi_2;}
+          };
+
+//!< derivatives of linear test functions for P1_3D element
+static
+std::array<std::array<std::function<double(double,double,double)>,3>  ,4> const
+  //dphi_P1_3d{
+              //[](double,double,double) {return -1.0;}, [](double,double,double) {return -1.0;}, [](double,double,double) {return -1.0;} , 
+              //[](double,double,double) {return  1.0;}, [](double,double,double) {return  0.0;}, [](double,double,double) {return  0.0;} , 
+              //[](double,double,double) {return  0.0;}, [](double,double,double) {return  1.0;}, [](double,double,double) {return  0.0;} , 
+              //[](double,double,double) {return  0.0;}, [](double,double,double) {return  0.0;}, [](double,double,double) {return  1.0;}  
+          //};
+// Always use double {{}} for initializer list for array, see https://stackoverflow.com/questions/22501368/why-wasnt-a-double-curly-braces-syntax-preferred-for-constructors-taking-a-std
+  dphi_P1_3d{{
+             {{ [](double,double,double) {return -1.0;}, [](double,double,double) {return -1.0;}, [](double,double,double) {return -1.0;} }}, 
+             {{ [](double,double,double) {return  1.0;}, [](double,double,double) {return  0.0;}, [](double,double,double) {return  0.0;} }}, 
+             {{ [](double,double,double) {return  0.0;}, [](double,double,double) {return  1.0;}, [](double,double,double) {return  0.0;} }}, 
+             {{ [](double,double,double) {return  0.0;}, [](double,double,double) {return  0.0;}, [](double,double,double) {return  1.0;} }} 
+          }};          
+
+//!< quadratic test functions for P2_3D element
+static
+std::array<std::function<double(double,double,double)>,10> const 
+ phi_P2_3d{
+          [](double xi_0,double xi_1,double xi_2) {return phi_P1_3d[0](xi_0,xi_1,xi_2)*(2*phi_P1_3d[0](xi_0,xi_1,xi_2)-1);},
+          [](double xi_0,double xi_1,double xi_2) {return phi_P1_3d[1](xi_0,xi_1,xi_2)*(2*phi_P1_3d[1](xi_0,xi_1,xi_2)-1);},
+          [](double xi_0,double xi_1,double xi_2) {return phi_P1_3d[2](xi_0,xi_1,xi_2)*(2*phi_P1_3d[2](xi_0,xi_1,xi_2)-1);},
+          [](double xi_0,double xi_1,double xi_2) {return phi_P1_3d[3](xi_0,xi_1,xi_2)*(2*phi_P1_3d[3](xi_0,xi_1,xi_2)-1);},
+          [](double xi_0,double xi_1,double xi_2) {return 4*phi_P1_3d[0](xi_0,xi_1,xi_2)*phi_P1_3d[1](xi_0,xi_1,xi_2);},
+          [](double xi_0,double xi_1,double xi_2) {return 4*phi_P1_3d[1](xi_0,xi_1,xi_2)*phi_P1_3d[2](xi_0,xi_1,xi_2);},
+          [](double xi_0,double xi_1,double xi_2) {return 4*phi_P1_3d[0](xi_0,xi_1,xi_2)*phi_P1_3d[2](xi_0,xi_1,xi_2);},
+          [](double xi_0,double xi_1,double xi_2) {return 4*phi_P1_3d[0](xi_0,xi_1,xi_2)*phi_P1_3d[3](xi_0,xi_1,xi_2);},
+          [](double xi_0,double xi_1,double xi_2) {return 4*phi_P1_3d[1](xi_0,xi_1,xi_2)*phi_P1_3d[3](xi_0,xi_1,xi_2);},
+          [](double xi_0,double xi_1,double xi_2) {return 4*phi_P1_3d[2](xi_0,xi_1,xi_2)*phi_P1_3d[3](xi_0,xi_1,xi_2);},
+          };
+
+
+// static polymorphism ?
+// yes, via template argument for FEM_Matrix_2<ELEM>
+// R.Grimm: https://www.modernescpp.com/index.php/dynamic-and-static-polymorphism
+/**
+ * Basis class for finite elements.
+ */ 
+class Element
+{
+    public:
+    constexpr Element(int ndim, int nvertices, int ndof)
+     : _ndim(ndim), _nvertices(nvertices), _ndof(ndof)  {}
+    
+    virtual ~Element();
+    //~Element() = default;
+
+    /**
+     * @return Space dimension.
+     */    
+    constexpr int nDim_loc() const
+    {return _ndim;}
+    
+    /**
+     * @return number of geometric vertices defining the element
+     */    
+    constexpr int nVertices_loc() const
+    {return _nvertices;}
+
+    /**
+     * @return number of degrees of freedem in element
+     */    
+    constexpr int nDOFs_loc() const
+    {return _ndof;}
+                
+    /**
+     * Derives the global indices for the DOFs in each element from the 
+     * global indices of geometric vertices @p ia_geom.
+     * 
+     * @param[in] ia_geom global geometric vertex indices[nelem*_nvertices]
+     * 
+     * @return global indices for the DOFs [nelem*_ndof]
+     */  
+    virtual 
+    std::vector<int> getGlobalDOFIndices(std::vector<int> const &ia_geom) const = 0;
+    
+    /**
+     * Allocates the local finite element matrix.
+     * 
+     * @return matrix [_ndof][_ndof]
+     */  
+    std::vector<std::vector<double>> createElemMatrix() const
+    {
+        return std::vector<std::vector<double>>(_ndof,std::vector<double>(_ndof,0.0));
+    }
+
+    /**
+     * Allocates the local finite element load vector.
+     * 
+     * @return vector [_ndof]
+     */  
+    std::vector<double> createElemVector() const
+    {
+        return std::vector<double>(_ndof,0.0);
+    }
+
+    /**
+     * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+     * of one triangular element with linear shape functions.
+     * @param[in]	ial	node indices of the element vertices [_nvertices]
+     * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+     * @param[out] ske	element stiffness matrix [_ndof][_ndof]
+     * @param[out] fe	element load vector [_ndof]
+     */        
+    void CalcLaplace(
+         int const ial[3], 
+         double const xc[], 
+         std::vector<std::vector<double>> &ske, 
+         std::vector<double> &fe) const;
+
+    /**
+     * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+     * of one triangular element with linear shape functions.
+     * @p fe will be calculated according to function @p f_func (x,y,z)
+     * @param[in]	ial	node indices of the element vertices [_nvertices]
+     * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+     * @param[out] ske	element stiffness matrix [_ndof][_ndof]
+     * @param[out] fe	element load vector [_ndof]
+     * @param[out] f_func function @p f_func (x,y,z)
+     */      
+    void CalcLaplace(
+         int const ial[3], double const xc[], 
+         std::vector<std::vector<double>> &ske, std::vector<double> &fe,
+         const std::function<double(double, double, double)> &f_func
+         ) const;
+    
+    private:
+    int const _ndim;      //!< space dimension {2,3}
+    int const _nvertices; //!< number of geometric vertices
+    int const _ndof;      //!< number of degrees of freedom;
+    // TODO: Add polynomial degree?
+};
+
+
+/**
+ *  Linear triangular P1-element.
+ *  The local vertex numbering follows Jung/Langer, Tabelle 4.3 (page 256).
+ */ 
+class P1_2d: public Element
+{
+    public:
+    /**
+     * @param[in] ndof_v degrees of freedom per vertex
+     */ 
+    constexpr     
+    P1_2d(int ndof_v=1)
+     : Element(2,3,3*ndof_v) { assert(0 == nDOFs_loc() % nVertices_loc() ); }
+     
+    virtual ~P1_2d() override {}
+     
+    /**
+     * Derives the global indices for the DOFs in each element from the 
+     * global indices of geometric vertices @p ia_geom.
+     * 
+     * @param[in] ia_geom global geometric vertex indices[nelem*_nvertices]
+     * 
+     * @return global indices for the DOFs [nelem*_ndof]
+     */ 
+    std::vector<int> getGlobalDOFIndices(std::vector<int> const &ia_geom) const override;
+
+    /**
+     * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+     * of one triangular element with linear shape functions.
+     * @param[in]	ial	node indices of the element vertices [_nvertices]
+     * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+     * @param[out] ske	element stiffness matrix [_ndof][_ndof]
+     * @param[out] fe	element load vector [_ndof]
+     */    
+    void CalcLaplace(
+        int const ial[3], 
+        double const xc[], 
+        std::vector<std::vector<double>> &ske, 
+        std::vector<double> &fe) const;
+
+    /**
+     * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+     * of one triangular element with linear shape functions.
+     * @p fe will be calculated according to function @p f_func (x,y)
+     * @param[in]	ial	node indices of the element vertices [_nvertices]
+     * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+     * @param[out] ske	element stiffness matrix [_ndof][_ndof]
+     * @param[out] fe	element load vector [_ndof]
+     * @param[out] f_func function @p f_func (x,y)
+     */      
+    void CalcLaplace(
+         int const ial[3], double const xc[], 
+         std::vector<std::vector<double>> &ske, std::vector<double> &fe,
+         const std::function<double(double, double)> &f_func
+         ) const;
+};
+
+///**
+ //*  Linear triangular P1-element.
+ //*  The local vertex numbering follows Jung/Langer, Tabelle 4.3 (page 256).
+ //*/ 
+//class P1_2d_1dof: public P1_2d
+//{
+    //public:
+    ///**
+     //* one degree of freedom per vertex
+     //*/ 
+    ////constexpr P1_2d_1dof()
+     ////: Element(2,3,3*1) { }
+    //P1_2d_1dof()
+     //: P1_2d(1) { }
+    
+    //virtual ~P1_2d_1dof() override {}
+//};
+
+/**
+ *  Linear tetrahedral P1-element.
+ *  The local vertex numbering follows Jung/Langer, Tabelle 4.3 (page 256).
+ */ 
+class P1_3d: public Element
+{
+    public:
+    /**
+     * @param[in] ndof_v degrees of freedom per vertex
+     */
+    constexpr
+    P1_3d(int ndof_v=1)
+     : Element(3,4,4*ndof_v) { assert(0 == nDOFs_loc() % nVertices_loc() ); }
+     
+    virtual ~P1_3d() override {}
+     
+    /**
+     * Derives the global indices for the DOFs in each element from the 
+     * global indices of geometric vertices @p ia_geom.
+     * 
+     * @param[in] ia_geom global geometric vertex indices[nelem*_nvertices]
+     * 
+     * @return global indices for the DOFs [nelem*_ndof]
+     */ 
+    std::vector<int> getGlobalDOFIndices(std::vector<int> const &ia_geom) const override;
+    
+    /**
+     * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+     * of one triangular element with linear shape functions.
+     * @param[in]	ial	node indices of the element vertices [_nvertices]
+     * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+     * @param[out] ske	element stiffness matrix [_ndof][_ndof]
+     * @param[out] fe	element load vector [_ndof]
+     */    
+    void CalcLaplace(int const ial[4], double const xc[], std::vector<std::vector<double>> &ske, std::vector<double> &fe) const;
+
+    /**
+     * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+     * of one triangular element with linear shape functions.
+     * @p fe will be calculated according to function @p f_func (x,y,z)
+     * @param[in]	ial	node indices of the element vertices [_nvertices]
+     * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+     * @param[out] ske	element stiffness matrix [_ndof][_ndof]
+     * @param[out] fe	element load vector [_ndof]
+     * @param[out] f_func function @p f_func (x,y,z)
+     */      
+    void CalcLaplace(
+         int const ial[4], double const xc[], 
+         std::vector<std::vector<double>> &ske, std::vector<double> &fe,
+         const std::function<double(double, double, double)> &f_func    ) const;    
+};
+
+/**
+ *  Quadratic tetrahedral P2-element.
+ */ 
+class P2_3d: public Element
+{
+    public:
+    /**
+     * @param[in] ndof_v degrees of freedom per vertex
+     */
+    constexpr
+    P2_3d(int ndof_v=1)
+     : Element(3,10,10*ndof_v) { assert(0 == nDOFs_loc() % nVertices_loc() ); }
+     
+    virtual ~P2_3d() override {}
+
+    /**
+     * Derives the global indices for the DOFs in each element from the 
+     * global indices of geometric vertices @p ia_geom.
+     * 
+     * @param[in] ia_geom global geometric vertex indices[nelem*_nvertices]
+     * 
+     * @return global indices for the DOFs [nelem*_ndof]
+     */ 
+    std::vector<int> getGlobalDOFIndices(std::vector<int> const &ia_geom) const override;
+};
+
+
+/**
+ *  Combined P1-P2 tetrahedral element with vector values in P2 vertices.
+ * 
+ *  Local numbering of dofs: 4 scalar values (linear test function) followed 
+ *  by 10 vector values (quadratic test function) consisting of 3 components. 
+ *
+ * 
+ */ 
+class P1_2vec_3d: public Element
+{
+    public:
+    /**
+     * @param[in] ndof_v  max. degrees of freedom per vertex
+     */
+    constexpr
+    P1_2vec_3d(int ndof_v=3)
+     : Element(3,10,4+10*ndof_v) {  }
+     
+    virtual ~P1_2vec_3d() override {}
+
+    /**
+     * Derives the global indices for the DOFs in each element from the 
+     * global indices of geometric vertices @p ia_geom.
+     * 
+     * @param[in] ia_geom global geometric vertex indices[nelem*_nvertices]
+     * 
+     * @return global indices for the DOFs [nelem*_ndof]
+     */ 
+    std::vector<int> getGlobalDOFIndices(std::vector<int> const &ia_geom) const override;
+
+    /**
+     * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+     * of one triangular element with linear shape functions.
+     * @param[in]	ial	node indices of the element vertices [_nvertices]
+     * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+     * @param[out] ske	element stiffness matrix [_ndof][_ndof]
+     * @param[out] fe	element load vector [_ndof]
+     * @param[out] f_func function @p f_func (x,y,z)
+     */    
+    void CalcLaplace(int const ial[10], double const xc[], 
+         std::vector<std::vector<double>> &ske, std::vector<double> &fe,
+         const std::function<double(double, double, double)> &f_func    ) const;      
+    
+};
+
+/**
+ * Copies only the first @p col1 columns from matrix @p v2 [n* @p col2]  with @p col1<= @p col2.
+ *  
+ * @param[in] v2   data[n*col2]
+ * @param[in] col1 #columns in output
+ * @param[in] col2 #columns in input
+ * @return    data[n*col1]
+ */ 
+std::vector<int> shrinkColumns(std::vector<int> const &v2, int col2, int col1);
+
+/**
+ * Merges data @p v1 and data @p into a new vector/matrix.
+ *  
+ * @param[in] v1   data[n*col1]
+ * @param[in] col1 #columns in @p v1
+ * @param[in] v2   data[n*col2]
+ * @param[in] col2 #columns in @p v2
+ * @return    data[n*(col1+col2)]
+ */
+std::vector<int> mergeColumns(std::vector<int> const &v1, int col1, std::vector<int> const &v2, int col2);
+
+/**
+ * An index vector @p vin for scalar values is transferred into 
+ * an index vector for @p ndof_v DOFs per scalar.
+ * An optional index @p offset is applied (useful for merging two index vectors).
+ *  
+ * @param[in] vin    index[n]
+ * @param[in] ndof_v #DOFs per original index
+ * @param[in] offset index offset for output indices
+ * @return    index[n*ndof_v]
+ * 
+ * @see mergeColumns
+ */
+std::vector<int> indexVectorBlowUp(std::vector<int> const &vin, int ndof_v=1, int offset=0);
+
+
+//######################################################################
+
+/**
+ * FEM Matrix in CRS format (compressed row storage; also named CSR),
+ * see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
+ */
+template <class ELEM=P1_3d()>
+class FEM_Matrix_2: public CRS_Matrix
+{
+    public:
+       /**
+        * Initializes the CRS matrix structure from the given discretization in @p mesh.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+        *
+        * @param[in] mesh given discretization
+        *
+        * @warning A reference to the discretization @p mesh is stored inside this class.
+        *          Therefore, changing @p mesh outside requires also
+        *          to call method @p Derive_Matrix_Pattern explicitly.
+        *
+        * @see Derive_Matrix_Pattern
+        */
+       explicit FEM_Matrix_2(Mesh const & mesh, ELEM const &elem=P1_3d());
+       //explicit FEM_Matrix_2(Mesh const & mesh);
+
+       FEM_Matrix_2(FEM_Matrix_2 const &) = default;
+
+      /**
+        * Destructor.
+        */
+       ~FEM_Matrix_2() override;
+       
+    /**
+     * Read DOF connectivity information (g1,g2,g3)_i.
+     * @return connectivity vector [nelems*ndofs].
+     */
+    [[nodiscard]] const std::vector<int>  &GetConnectivity() const
+    {
+        return _ia_dof;
+    }
+    
+    /**
+     * Read DOF connectivity information (g1,g2,g3)_i.
+     * @return connectivity vector [nelems*ndofs].
+     */
+    [[nodiscard]] const std::vector<int>  &GetConnectivityGeom() const
+    {
+        return _mesh.GetConnectivity();
+    }
+    
+
+    /**
+     * Determines all node to node connections from the dof based mesh.
+      *
+      * Faster than @p Node2NodeGraph_1().
+      *
+     * @return vector[k][] containing all connections of dof k, including to itself.
+     */    
+    std::vector<std::vector<int>> Node2NodeGraph() const
+    {
+        //std::cout << Nelems() << " " << NdofsElement() << " " << GetConnectivity().size() << std::endl;
+        return ::Node2NodeGraph(Nelems(),NdofsElement(),GetConnectivity());
+    }
+
+    
+    /**
+     * Checks whether the mesh is compatible with the choosen finite element type.
+     * @return true iff compatible.
+     */
+    bool CheckCompatibility() const;
+    
+    constexpr int Nelems() const
+    { return _mesh.Nelems();}
+    
+    //constexpr int Nnodes() const
+    //{ return _mesh.Nnodes();}
+    
+    constexpr int NdofsElement() const
+    { return _elem.nDOFs_loc(); }
+    
+    constexpr int NverticesElement() const
+    { return _elem.nVertices_loc(); }
+    
+    constexpr int Ndims() const
+    { return _elem.nDim_loc(); }    
+
+       /**
+        * Generates the sparse matrix pattern and overwrites the existing pattern.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+       */
+       void Derive_Matrix_Pattern();
+       //{
+           ////Derive_Matrix_Pattern_slow();
+           //Derive_Matrix_Pattern_fast();
+           //CheckRowSum();
+       //}
+       //void Derive_Matrix_Pattern_fast();
+       //void Derive_Matrix_Pattern_slow();
+
+
+        /**
+        * Calculates the entries of f.e. stiffness matrix for the Laplace operator
+        * and load/rhs vector @p f.
+        * No memory is allocated.
+        *
+        * @param[in,out] f (preallocated) rhs/load vector
+        * @warning Only linear elements (P1_2d, P1_3d) are supported.
+        * @see P1_2d
+        * @see P1_3d
+        */
+       void CalculateLaplace(std::vector<double> &f);
+       
+        /**
+        * Calculates the entries of f.e. stiffness matrix for the Laplace operator in 2D
+        * and load/rhs vector @p f according to function @p f_func,
+        *
+        * @param[in,out] f (preallocated) rhs/load vector
+        * @param[in]     f_func function f(x,y)
+        */
+       void CalculateLaplace(
+           std::vector<double> &f, 
+           std::function<double(double, double)> const &f_func);
+
+        /**
+        * Calculates the entries of f.e. stiffness matrix for the Laplace operator in 3D
+        * and load/rhs vector @p f according to function @p f_func,
+        *
+        * @param[in,out] f (preallocated) rhs/load vector
+        * @param[in]     f_func function f(x,y,z)
+        */
+       void CalculateLaplace(
+           std::vector<double> &f, 
+           std::function<double(double, double, double)> const &f_func);
+
+      /**
+        * Adds the element stiffness matrix @p ske and the element load vector @p fe
+        * of one triangular element with linear shape functions to the appropriate positions in
+        * the stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik).
+        *
+        * @param[in]     ial   node indices of the three element vertices
+        * @param[in]     ske   element stiffness matrix
+        * @param[in]     fe    element load vector
+        * @param[in,out] f	   distributed local vector storing the right hand side
+        *
+        * @warning Algorithm assumes  linear triangular elements (ndof_e==3).
+       */
+       //void AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], std::vector<double> &f);
+       void AddElem(
+           int const ial[], 
+           std::vector<std::vector<double>> const& ske, 
+           std::vector<double> const &fe, 
+           std::vector<double> &f);
+
+       /**
+        * Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f.
+        * The <a href="https://www.jstor.org/stable/2005611?seq=1#metadata_info_tab_contents">penalty method</a>
+        * is used for incorporating the given values @p u.
+        *
+        * @param[in]     u (global) vector with Dirichlet data
+        * @param[in,out] f load vector
+        */
+       void ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f);
+
+       /**
+        * Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f.
+        * The <a href="https://www.jstor.org/stable/2005611?seq=1#metadata_info_tab_contents">penalty method</a>
+        * is used for incorporating the given values @p u 
+        * at the surface of the bounding box [@p xl, @p xh]x[@p yl, @p yh]x[@p zl, @p zh].
+        * 
+        * Skipping @p zl and @p zh reduced the box to a rectangle in 2D.
+        *
+        * @param[in]     u  (global) vector with Dirichlet data
+        * @param[in,out] f  load vector
+        * @param[in]     xl lower  value x-bounds
+        * @param[in]     xh higher value x-bounds
+        * @param[in]     yl lower  value y-bounds
+        * @param[in]     yh higher value y-bounds
+        * @param[in]     zl lower  value z-bounds
+        * @param[in]     zh higher value z-bounds
+        */
+       void ApplyDirichletBC_Box(std::vector<double> const &u, std::vector<double> &f,
+            double xl, double xh, double yl, double yh, 
+            double zl=-std::numeric_limits<double>::infinity(), 
+            double zh= std::numeric_limits<double>::infinity() );
+
+    private:
+       Mesh const     & _mesh;   //!< discretization (contains element connectivity regarding geometric vertices)
+       ELEM const       _elem;   //!< element type
+       std::vector<int> _ia_dof; //!< element connectivity regarding DOFs in element
+
+};
+
+//######################################################################
+
+template <class ELEM>
+FEM_Matrix_2<ELEM>::FEM_Matrix_2(Mesh const &mesh, ELEM const &elem)
+//FEM_Matrix_2<ELEM>::FEM_Matrix_2(Mesh const &mesh)
+    : CRS_Matrix(), _mesh(mesh), _elem(elem), _ia_dof()
+    //: CRS_Matrix(), _mesh(mesh), _elem(ELEM()), _ia_dof()
+{
+    assert(CheckCompatibility());
+    
+    _ia_dof = _elem.getGlobalDOFIndices(GetConnectivityGeom());
+    Derive_Matrix_Pattern();                      // requires _ia_dof
+    return;
+}
+
+template <class ELEM>
+FEM_Matrix_2<ELEM>::~FEM_Matrix_2()
+{}
+
+template <class ELEM>
+bool FEM_Matrix_2<ELEM>::CheckCompatibility() const
+{
+    int const ndim_fem  = _elem.nDim_loc();
+    int const nDOF_fem  = _elem.nDOFs_loc();
+    int const nvert_fem = _elem.nVertices_loc();
+    
+    int const ndim = _mesh.Ndims();
+    int const nDOF = _mesh.NdofsElement();
+    int const nvert= _mesh.NverticesElement();
+    std::cout << "##  FEM_Matrix_2::CheckCompatibility()  ##" << std::endl;
+    
+    bool ok{true};
+    if (ndim_fem!=ndim)
+    {
+        std::cout << "Space dimensions mismatch: " << ndim << " vs. " << ndim_fem << std::endl;
+        ok = false;
+    }
+    if (ok && nvert_fem!=nvert)
+    {
+        std::cout << "#local vertices mismatch: " << nvert << " vs. " << nvert_fem << std::endl;
+        ok = false;
+    }
+    if (ok && nDOF_fem < nDOF)
+    {
+        std::cout << "#local DOFs to small: " << nDOF << " vs. " << nDOF_fem << std::endl;
+        ok = false;
+    }
+    
+    return true;
+}
+
+template <class ELEM>
+void FEM_Matrix_2<ELEM>::Derive_Matrix_Pattern()
+{
+    if (_ia_dof.empty())
+    {
+        _ia_dof = _elem.getGlobalDOFIndices(GetConnectivityGeom());
+    }
+    std::cout << "\n############   FEM_Matrix::Derive_Matrix_Pattern ";
+    MyTimer tstart;    //tstart.tic();
+    
+    //int const nelem(Nelems());
+    //int const ndof_e(NdofsElement());
+    auto const &ia(GetConnectivity());
+    
+    //std::cout << ia << std::endl;
+    
+//  Determine the number of matrix rows
+    //_nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem);
+    _nrows = *max_element(ia.cbegin(), ia.cend()); // GH??
+    ++_nrows;                                 // node numberng: 0 ... nnode-1
+    //assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ?
+    assert(*min_element(ia.cbegin(), ia.cend()) == 0); // numbering starts with 0 ? // GH??
+
+// CSR data allocation
+    _id.resize(_nrows + 1);                  // Allocate memory for CSR row pointer
+//##########################################################################
+    //auto const v2v = _mesh.Node2NodeGraph();               // GH TODO:  Hier Node2NodeGraph mit _ia_dof
+    auto const v2v = Node2NodeGraph();               // GH TODO:  Hier Node2NodeGraph mit _ia_dof
+    _nnz = 0;                         // number of connections
+    _id[0] = 0;                       // start of matrix row zero
+    for (size_t v = 0; v < v2v.size(); ++v ) {
+        _id[v + 1] = _id[v] + v2v[v].size();
+        _nnz += v2v[v].size();
+    }
+    assert(_nnz == _id[_nrows]);             // GH: TODO
+    _sk.resize(_nnz,-12345.0);               // Allocate memory for CSR values vector
+
+// CSR data allocation
+    _ik.resize(_nnz);                        // Allocate memory for CSR column index vector
+// Copy column indices
+    int kk = 0;
+    for (const auto & v : v2v) {
+        for (size_t vi = 0; vi < v.size(); ++vi) {
+            _ik[kk] = v[vi];
+            ++kk;
+        }
+    }
+    _ncols = *max_element(_ik.cbegin(), _ik.cend());  // maximal column number
+    ++_ncols;                                         // node numbering: 0 ... nnode-1
+    //cout << _nrows << "  " << _ncols << endl;
+    assert(_ncols == _nrows);
+
+    std::cout << "finished in  " <<  tstart.toc()  << " sec.    ########\n";
+
+    return;
+}
+
+template <class ELEM>
+void FEM_Matrix_2<ELEM>::AddElem(int const ial[], std::vector<std::vector<double>> const &ske, std::vector<double> const &fe, std::vector<double> &f)
+{
+    //assert(NdofsElement() == 3);               // only for triangular, linear elements
+    for (int i = 0; i < NdofsElement(); ++i) {
+        const int ii  = ial[i];           // row ii (global index)
+        for (int j = 0; j < NdofsElement(); ++j) {     // no symmetry assumed
+            const int jj = ial[j];        // column jj (global index)
+            const int ip = fetch(ii, jj);       // find column entry jj in row ii
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+            if (ip < 0) {      // no entry found !!
+                std::cout << "Error in AddElem: (" << ii << "," << jj << ") ["
+                          << ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
+                assert(ip >= 0);
+            }
+#endif
+            #pragma omp atomic
+            _sk[ip] += ske[i][j];
+        }
+        #pragma omp atomic
+        f[ii] += fe[i];
+    }
+}
+
+template <class ELEM>
+void FEM_Matrix_2<ELEM>::CalculateLaplace(
+    std::vector<double> &f, 
+    std::function<double(double, double)> const &f_func)          // 2D
+{
+    assert(_elem.nDim_loc()==2);                                  // 2D
+    std::cout << "\n############   FEM_Matrix::CalculateLaplace f_func 2D";
+    //std::cout << _nnz << " vs. " << _id[_nrows] << "  " << _nrows<< std::endl;
+    assert(_nnz == _id[_nrows]);
+    
+    MyTimer tstart;    //tstart.tic();
+    fill(_sk.begin(),_sk.end(),0.0);              // set     matrix entries to zero
+    fill(  f.begin(),  f.end(),0.0);              // set rhs vector entries to zero
+
+    auto const nelem    = Nelems();
+    auto const nvert_e  = NverticesElement();     // geom. vertices per element
+    auto const ndof_e   = NdofsElement();         // DOFs per element
+    auto const &ia_geom = GetConnectivityGeom();  // geometric connectivity
+    auto const &ia_crs  = GetConnectivity();      // DOF connectivity in CRS matrix
+    auto const &xc      = _mesh.GetCoords();      // Coordinates
+
+    #pragma omp parallel
+    {
+    auto ske(_elem.createElemMatrix());
+    auto fe (_elem.createElemVector());
+    //#pragma omp parallel for private(ske,fe)    // GH: doesn't work correctly with vector<vector<double>>
+    #pragma omp for    
+    for (int i = 0; i < nelem; ++i) {             //  Loop over all elements
+        _elem.CalcLaplace(ia_geom.data() + nvert_e * i, xc.data(), ske, fe,f_func);
+        AddElem(ia_crs.data() + ndof_e * i, ske, fe, f);
+    }
+    }
+
+    std::cout << "finished in  " <<  tstart.toc()  << " sec.    ########\n";
+    //Debug();
+    return;
+}
+
+template <class ELEM>
+void FEM_Matrix_2<ELEM>::CalculateLaplace(
+    std::vector<double> &f, 
+    std::function<double(double, double, double)> const &f_func)  // 3D
+    //const std::variant<std::function<double(double, double)>, std::function<double(double, double, double)>> &f_func)
+{
+    assert(_elem.nDim_loc()==3);                                  // 3D
+    std::cout << "\n############   FEM_Matrix::CalculateLaplace f_func 3D";
+    //std::cout << _nnz << " vs. " << _id[_nrows] << "  " << _nrows<< std::endl;
+    assert(_nnz == _id[_nrows]);
+    
+    MyTimer tstart;    //tstart.tic();
+    fill(_sk.begin(),_sk.end(),0.0);              // set     matrix entries to zero
+    fill(  f.begin(),  f.end(),0.0);              // set rhs vector entries to zero
+
+    auto const nelem    = Nelems();
+    auto const nvert_e  = NverticesElement();     // geom. vertices per element
+    auto const ndof_e   = NdofsElement();         // DOFs per element
+    auto const &ia_geom = GetConnectivityGeom();  // geometric connectivity
+    auto const &ia_crs  = GetConnectivity();      // DOF connectivity in CRS matrix
+    auto const &xc      = _mesh.GetCoords();      // Coordinates
+
+    #pragma omp parallel
+    {
+    auto ske(_elem.createElemMatrix());
+    auto fe (_elem.createElemVector());
+    //#pragma omp parallel for private(ske,fe)    // GH: doesn't work correctly with vector<vector<double>>
+    #pragma omp for    
+    for (int i = 0; i < nelem; ++i) {             //  Loop over all elements
+        _elem.CalcLaplace(ia_geom.data() + nvert_e * i, xc.data(), ske, fe,f_func);
+        AddElem(ia_crs.data() + ndof_e * i, ske, fe, f);
+    }
+    }
+
+    std::cout << "finished in  " <<  tstart.toc()  << " sec.    ########\n";
+    //Debug();
+    return;
+}
+
+// only for linear elements and scalar functions
+template <class ELEM>
+void FEM_Matrix_2<ELEM>::CalculateLaplace(std::vector<double> &f)
+{
+    std::cout << "\n############   FEM_Matrix::CalculateLaplace ";
+    if constexpr(std::is_same<ELEM,P1_3d>::value)
+    {
+        CalculateLaplace(f, rhs_lap3 );                           // 3D standard rhs
+    }
+    else if constexpr(std::is_same<ELEM,P1_2d>::value)
+    { 
+        CalculateLaplace(f, rhs_lap2 );                           // 2D standard rhs
+    }
+    else
+    {
+        assert(false);
+    }
+}
+
+
+// 2D-part: copy from getmatrix.cpp
+
+template <class ELEM>
+void FEM_Matrix_2<ELEM>::ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f)
+{
+    if (2==_elem.nDim_loc())
+    {
+        auto const idx = _mesh.Index_DirichletNodes();      // GH: not available in 3D
+        int const nidx = idx.size();
+
+        for (int i = 0; i < nidx; ++i) {
+            int const row = idx[i];
+            for (int ij = _id[row]; ij < _id[row + 1]; ++ij) {
+                int const col = _ik[ij];
+                if (col == row) {
+                    _sk[ij] = 1.0;
+                    f[row]  = u[row];
+                 }
+                else {
+                    int const id1 = fetch(col, row); // Find entry (col,row)
+                    assert(id1 >= 0);
+                    f[col] -= _sk[id1] * u[row];
+                    _sk[id1] = 0.0;
+                    _sk[ij]  = 0.0;
+                }
+            }
+        }
+    }
+    else
+    {
+        std::cout << "FEM_Matrix_2<ELEM>::ApplyDirichletBC not implemented in 3D" << std::endl;
+        assert(false);
+    }
+
+    return;
+}
+
+template <class ELEM>
+void FEM_Matrix_2<ELEM>::ApplyDirichletBC_Box(std::vector<double> const &u, std::vector<double> &f,
+            double xl, double xh, double yl, double yh, double zl, double zh )
+{
+    std::vector<int> idx;
+    if (3==_mesh.Ndims())
+    {
+        idx = _mesh.Index_DirichletNodes_Box(xl, xh, yl, yh, zl, zh);
+        //std::cout << "####  3D idx: " << idx << std::endl;
+    }
+    else if (2==_mesh.Ndims())
+    {
+        idx = _mesh.Index_DirichletNodes_Box(xl, xh, yl, yh);
+        //std::cout << "####  2D idx: " << idx << std::endl;
+    }
+    else
+    {
+        assert(false);
+    }
+    
+    double const PENALTY = 1e6;
+    int const nidx = idx.size();
+
+    for (int row=0; row<nidx; ++row)
+    {
+        int const k = idx[row];
+		int const id1 = fetch(k, k); // Find diagonal entry of row
+		assert(id1 >= 0);
+		_sk[id1] += PENALTY;		// matrix weighted scaling feasible
+        f[k] += PENALTY * u[k];
+    }
+
+    return;
+}
+
+
--- a/mgrid_2/generateCRS.cpp
+++ b/mgrid_2/generateCRS.cpp
@ -0,0 +1,65 @@
+#include "binaryIO.h"
+#include "geom.h"
+#include "getmatrix.h"
+#include "jacsolve.h"
+#include "userset.h"
+#include "vdop.h"
+
+#include <cassert>
+#include <chrono>           // timing
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <omp.h>
+#include <vector>
+using namespace std;
+using namespace std::chrono;  // timing
+
+///  Generates sparse matrices (CRS) and right hand side for the Laplace on the given mesh
+///  and stores them in binary files.
+///
+///  - meshname("square_100")   reads ASCii mesh file square_100.txt
+///  - Right hand side is defined via function fNice(x,y) in userset.h
+///  - the initial mesh is refined (nmesh-1) 2 times by standard.
+///  - ./generateCRS nmesh           --> nmesh-1 refinement steps
+
+int main(int argc, char **argv )
+{
+	const string meshname("square_100");
+	//const string meshname("square_06");
+    int nmesh = 3;
+    if (argc > 1)  nmesh = atoi(argv[1]);
+
+    Mesh const mesh_c(meshname+".txt");
+    bool ba = mesh_c.checkObtuseAngles();
+    if (ba) cout << "mesh corrected" << endl;
+    //mesh_c.Debug(); cout << endl << "#############################\n";
+
+    gMesh_Hierarchy ggm(mesh_c, nmesh);
+
+    for (size_t ll = 0; ll < ggm.size(); ++ll)
+    {
+        const Mesh &mesh = ggm[ll];
+		cout << "Matrix for mesh " << ll << " with " << mesh.Nnodes() << " nodes."<< endl;
+        FEM_Matrix SK(mesh);
+        vector<double> uv(SK.Nrows(), 0.0);    // temperature
+        vector<double> fv(SK.Nrows(), 0.0);    // r.h.s.
+        SK.CalculateLaplace(fv);
+        //SK.CheckRowSum();
+        SK.CheckMatrix();
+        mesh.SetValues(uv, [](double x, double y) -> double
+        {
+            return x *x * std::sin(2.5 * M_PI * y);
+        } );
+        SK.ApplyDirichletBC(uv, fv);
+
+        const string binname(meshname+"_"+to_string(ll)+"_mat"+".bin");
+        SK.writeBinary(binname);
+        const string rhsname(meshname+"_"+to_string(ll)+"_rhs"+".bin");
+        write_binVector(rhsname,fv);
+        //SK.Debug();
+    }
+
+    return 0;
+}
+
--- a/mgrid_2/geom.cpp
+++ b/mgrid_2/geom.cpp
--- a/mgrid_2/geom.h
+++ b/mgrid_2/geom.h
@ -0,0 +1,897 @@
+#pragma once
+#include <array>
+#include <functional>             // function; C++11
+#include <iostream>
+#include <memory>                  // shared_ptr
+#include <string>
+#include <vector>
+
+/**
+ * Basis class for finite element meshes.
+ */
+class Mesh
+{
+public:
+    /**
+      * Constructor initializing the members with default values.
+      *
+      * @param[in] ndim  space dimensions (dimension for coordinates)
+      * @param[in] nvert_e  number of vertices per element (dimension for connectivity)
+      * @param[in] ndof_e   degrees of freedom per element (= @p nvert_e for linear elements)
+      * @param[in] nedge_e  number of edges per element (= @p nvert_e for linear elements in 2D)
+      */
+    explicit Mesh(int ndim, int nvert_e = 0, int ndof_e = 0, int nedge_e = 0);
+    
+    Mesh() : Mesh(0) {}
+
+    Mesh(Mesh const &) = default;
+
+    Mesh &operator=(Mesh const &) = delete;
+
+    /**
+     * Destructor.
+     *
+     * See clang warning on
+     * <a href="https://stackoverflow.com/questions/28786473/clang-no-out-of-line-virtual-method-definitions-pure-abstract-c-class/40550578">weak-vtables</a>.
+     */
+    virtual ~Mesh();
+
+    /**
+     * Reads mesh data from a binary file.
+     *
+     * File format, see ascii_write_mesh.m
+     *
+     * @param[in] fname file name
+    */
+    explicit Mesh(std::string const &fname);
+
+    /**
+     * Reads mesh data from a binary file.
+     *
+     * File format, see ascii_write_mesh.m
+     *
+     * @param[in] fname file name
+    */
+    void ReadVertexBasedMesh(std::string const &fname);
+
+    /**
+     * Number of finite elements in (sub)domain.
+     * @return number of elements.
+     */
+    [[nodiscard]] int Nelems() const
+    {
+        return _nelem;
+    }
+
+    /**
+     * Global number of vertices for each finite element.
+     * @return number of vertices per element.
+     */
+    [[nodiscard]] int NverticesElement() const
+    {
+        return _nvert_e;
+    }
+
+    /**
+     * Global number of degrees of freedom (dof) for each finite element.
+     * @return degrees of freedom per element.
+     */
+    [[nodiscard]] int NdofsElement() const
+    {
+        return _ndof_e;
+    }
+
+    /**
+     * Number of vertices in mesh.
+     * @return number of vertices.
+     */
+    [[nodiscard]] int Nnodes() const
+    {
+        return _nnode;
+    }
+
+    /**
+     * Space dimension.
+     * @return number of dimensions.
+     */
+    [[nodiscard]] int Ndims() const
+    {
+        return _ndim;
+    }
+
+    /**
+     * (Re-)Allocates memory for the geometric element connectivity and redefines the appropriate dimensions.
+     *
+     * @param[in] nelem    number of elements
+     * @param[in] nvert_e  number of vertices per element
+     */
+    void Resize_Connectivity(int nelem, int nvert_e)
+    {
+        SetNelem(nelem);               // number of elements
+        SetNverticesElement(nvert_e);  // vertices per element
+        _ia.resize(nelem * nvert_e);
+    }
+
+    /**
+     * Read geometric connectivity information (g1,g2,g3)_i.
+     * @return connectivity vector [nelems*ndofs].
+     */
+    [[nodiscard]] const std::vector<int>  &GetConnectivity() const
+    {
+        return _ia;
+    }
+
+    /**
+     * Access/Change geometric connectivity information (g1,g2,g3)_i.
+     * @return connectivity vector [nelems*ndofs].
+     */
+    std::vector<int>  &GetConnectivity()
+    {
+        return _ia;
+    }
+
+    /**
+     * (Re-)Allocates memory for coordinates and redefines the appropriate dimensions.
+     *
+     * @param[in] nnodes    number of nodes
+     * @param[in] ndim      space dimension
+     */
+    void Resize_Coords(int nnodes, int ndim)
+    {
+        SetNnode(nnodes);       // number of nodes
+        SetNdim(ndim);          // space dimension
+        _xc.resize(nnodes * ndim);
+    }
+
+    /**
+     * Read coordinates of vertices (x,y)_i.
+     * @return coordinates vector [nnodes*2].
+     */
+    [[nodiscard]] const std::vector<double> &GetCoords() const
+    {
+        return _xc;
+    }
+
+    /**
+     * Access/Change coordinates of vertices (x,y)_i.
+     * @return coordinates vector [nnodes*2].
+     */
+    std::vector<double> &GetCoords()
+    {
+        return _xc;
+    }
+
+    /**
+     * Calculate values in scalar vector @p v via function @p func(x,y)
+     * @param[in] v     scalar vector
+     * @param[in] func  function of (x,y) returning a double value.
+     */
+    void SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const;
+
+    /**
+     * Calculate values in scalar vector @p v via function @p func(x,y,z)
+     * @param[in] v     scalar  vector
+     * @param[in] func  function of (x,y,z) returning a double value.
+     */
+    void SetValues(std::vector<double> &v, const std::function<double(double, double, double)> &func) const;
+
+    /**
+     * Calculate values in vector valued vector @p v via functions @p func?(x,y,z)
+     * @param[in] vvec  vector
+     * @param[in] func0  function of (x,y,z) returning a double value.
+     * @param[in] func1  function of (x,y,z) returning a double value.
+     * @param[in] func2  function of (x,y,z) returning a double value.
+     */
+     void SetValues(std::vector<double> &vvec, 
+        const std::function<double(double, double, double)> &func0,
+        const std::function<double(double, double, double)> &func1,
+        const std::function<double(double, double, double)> &func2 ) const;
+
+    /**
+     * Prints the information for a finite element mesh
+     */
+    void Debug() const;
+
+    /**
+     * Prints the edge based information for a finite element mesh
+     */
+    void DebugEdgeBased() const;
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions.
+     * 
+     * All boundary nodes are considered as Dirchlet nodes. 
+     * @return index vector.
+     * @warning Not available in 3D. 
+     *          Vector _bedges is currently not included in the 3D input file.
+     */
+    [[nodiscard]] virtual std::vector<int> Index_DirichletNodes() const;
+    
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions.
+     * 
+     * All discretization nodes located at the perimeter of rectangle
+     * [@p xl, @p xh]x[@p yl, @p yh]
+     * are defined as Dirichlet nodes.
+     * 
+     * @param[in] xl lower  value x-bounds
+     * @param[in] xh higher value x-bounds
+     * @param[in] yl lower  value y-bounds
+     * @param[in] yh higher value y-bounds
+     * @return index vector.
+     */
+    [[nodiscard]] 
+    virtual std::vector<int> Index_DirichletNodes_Box
+            (double xl, double xh, double yl, double yh) const;
+
+    
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions.
+     * 
+     * All discretization nodes located at the surface 
+     * of the bounding box are [@p xl, @p xh]x[@p yl, @p yh]x[@p zl, @p zh]
+     * are defined as Dirichlet nodes.
+     * 
+     * @param[in] xl lower  value x-bounds
+     * @param[in] xh higher value x-bounds
+     * @param[in] yl lower  value y-bounds
+     * @param[in] yh higher value y-bounds
+     * @param[in] zl lower  value z-bounds
+     * @param[in] zh higher value z-bounds
+     * @return index vector.
+     */
+    [[nodiscard]] 
+    virtual std::vector<int> Index_DirichletNodes_Box
+            (double xl, double xh, double yl, double yh,double zl, double zh) const;
+
+
+    /**
+     * Exports the mesh information to ASCii files  @p basename + {_coords|_elements}.txt.
+     *
+     * The data are written in C indexing.
+     *
+     * @param[in] basename  first part of file names
+     */
+    void Export_scicomp(std::string const &basename) const;
+
+    /**
+     * Write vector @p v together with its mesh information to an ASCii file @p fname.
+     *
+     * The data are written in Matlab indexing.
+     *
+     * @param[in] fname  file name
+     * @param[in] v      vector
+     */
+    void Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const;
+
+    /**
+     * Visualize @p v together with its mesh information via matlab or octave.
+     *
+     * Comment/uncomment those code lines in method Mesh:Visualize (geom.cpp)
+     * that are supported on your system.
+     *
+     * @param[in] v      vector
+     *
+     * @warning matlab files ascii_read_meshvector.m  visualize_results.m
+     *          must be in the executing directory.
+     */
+    void Visualize_matlab(std::vector<double> const &v) const;
+    
+     /**
+     * Visualizse @p v together with its mesh information.
+     *
+     * Comment/uncomment those code lines in method Mesh:Visualize (geom.cpp)
+     * that are supported on your system.
+     *
+     * @param[in] v      vector
+     *
+     * @warning matlab files ascii_read_meshvector.m  visualize_results.m
+     *          must be in the executing directory.
+     */   
+    void Visualize(std::vector<double> const &v) const;
+
+    /**
+     * Write vector @p v together with its mesh information to an ASCii file @p fname.
+     *
+     * The data are written in C indexing for the VTK/paraview format.
+     *
+     * @param[in] fname  file name
+     * @param[in] v      vector
+     */
+    void Write_ascii_paraview(std::string const &fname, std::vector<double> const &v) const;
+private:    
+    void Write_ascii_paraview_2D(std::string const &fname, std::vector<double> const &v) const;
+    void Write_ascii_paraview_3D(std::string const &fname, std::vector<double> const &v) const;
+    
+public:
+     /**
+     * Visualize @p v together with its mesh information via paraview
+     *
+     * @param[in] v      vector
+     *
+     */   
+    void Visualize_paraview(std::vector<double> const &v) const;
+
+
+    /**
+     * Global number of edges.
+     * @return number of edges in mesh.
+     */
+    [[nodiscard]] int Nedges() const
+    {
+        return _nedge;
+    }
+
+    /**
+     * Global number of edges for each finite element.
+     * @return number of edges per element.
+     */
+    [[nodiscard]] int NedgesElements() const
+    {
+        return _nedge_e;
+    }
+
+    /**
+     * Read edge connectivity information (e1,e2,e3)_i.
+     * @return edge connectivity vector [nelems*_nedge_e].
+     */
+    [[nodiscard]] const std::vector<int>  &GetEdgeConnectivity() const
+    {
+        return _ea;
+    }
+
+    /**
+     * Access/Change edge connectivity information (e1,e2,e3)_i.
+     * @return edge connectivity vector [nelems*_nedge_e].
+     */
+    std::vector<int>  &GetEdgeConnectivity()
+    {
+        return _ea;
+    }
+
+    /**
+     * Read edge information (v1,v2)_i.
+     * @return edge connectivity vector [_nedge*2].
+     */
+    [[nodiscard]] const std::vector<int>  &GetEdges() const
+    {
+        return _edges;
+    }
+
+    /**
+     * Access/Change edge information (v1,v2)_i.
+     * @return edge connectivity vector [_nedge*2].
+     */
+    std::vector<int>  &GetEdges()
+    {
+        return _edges;
+    }
+
+    /**
+     * Determines all node to node connections from the vertex based mesh.
+      *
+     * @return vector[k][] containing all connections of vertex k, including to itself.
+     */
+    [[nodiscard]] std::vector<std::vector<int>> Node2NodeGraph() const
+    {
+        //// Check version 2 wrt. version 1
+        //auto v1=Node2NodeGraph_1();
+        //auto v2=Node2NodeGraph_2();
+        //if ( equal(v1.cbegin(),v1.cend(),v2.begin()) )
+        //{
+        //std::cout << "\nidentical Versions\n";
+        //}
+        //else
+        //{
+        //std::cout << "\nE R R O R   in Versions\n";
+        //}
+
+        //return Node2NodeGraph_1();
+        return Node2NodeGraph_2();        // 2 times faster than version 1
+    }
+
+    /**
+     * Accesses the father-of-nodes relation.
+      *
+     * @return  vector of length 0 because no relation available.
+      *
+     */
+    [[nodiscard]] virtual std::vector<int> const &GetFathersOfVertices() const
+    {
+        return _dummy;
+    }
+
+    /**
+     * Deletes all edge connectivity information (saves memory).
+     */
+    void Del_EdgeConnectivity();
+
+
+    /**
+      * All data containing vertex numbering are renumbered or sorted according to
+      * the permutation @p permut_old2new .
+      *
+      * @param[in] old2new      permutation of vertex indices: old2new[k] stores the new index of old index k
+      *
+     */
+    virtual void PermuteVertices(std::vector<int> const& old2new);
+    
+   /**
+     * Converts the (linear) P1 mesh into a (quadratic) P2 mesh.
+     */
+    void liftToQuadratic();    
+
+protected:
+    //public:
+    void SetNelem(int nelem)
+    {
+        _nelem = nelem;
+    }
+
+    void SetNverticesElement(int nvert)
+    {
+        _nvert_e = nvert;
+    }
+
+    void SetNdofsElement(int ndof)
+    {
+        _ndof_e = ndof;
+    }
+
+    void SetNnode(int nnode)
+    {
+        _nnode = nnode;
+    }
+
+    void SetNdim(int ndim)
+    {
+        _ndim = ndim;
+    }
+
+    void SetNedge(int nedge)
+    {
+        _nedge = nedge;
+    }
+
+    /**
+     * Reads vertex based mesh data from a binary file.
+     *
+     * File format, see ascii_write_mesh.m
+     *
+     * @param[in] fname file name
+    */
+    void ReadVectexBasedMesh(std::string const &fname);
+
+    /**
+     * The vertex based mesh data are used to derive the edge based data.
+     *
+     *  @warning Exactly 3 vertices, 3 edges per element are assumed (linear triangle in 2D)
+    */
+    void DeriveEdgeFromVertexBased()
+    {
+        //DeriveEdgeFromVertexBased_slow();
+        //DeriveEdgeFromVertexBased_fast();
+        if (2==Ndims())
+        {
+            DeriveEdgeFromVertexBased_fast_2();
+        }
+        else
+        {   // ToDo
+            std::cout << std::endl << "ToDo:  DeriveEdgeFromVertexBased for 3D!" << std::endl;
+        }
+    }
+    void DeriveEdgeFromVertexBased_slow();
+    void DeriveEdgeFromVertexBased_fast();
+    void DeriveEdgeFromVertexBased_fast_2();
+
+
+    /**
+     * The edge based mesh data are used to derive the vertex based data.
+      *
+      *  @warning Exactly 3 vertices, 3 edges per element are assumed (linear triangle in 2D)
+    */
+    void DeriveVertexFromEdgeBased();
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    [[nodiscard]] int Nnbedges() const
+    {
+        return static_cast<int>(_bedges.size());
+    }
+
+    /**
+     * Checks whether the array dimensions fit to their appropriate size parameters
+     * @return index vector.
+     */
+    [[nodiscard]] virtual bool Check_array_dimensions() const;
+
+    /**
+     * Permutes the vertex information in an edge based mesh.
+     *
+     * @param[in] old2new   new indices of original vertices.
+     */
+     
+    virtual void PermuteVertices_EdgeBased(std::vector<int> const &old2new);
+
+public:    
+    /**
+     * Check all elements for an inner angle > pi/2.
+     */    
+    [[nodiscard]] bool checkObtuseAngles() const;
+
+private:
+    /**
+     * Calculates the largest inner angle in element @p idx.
+     * 
+     * @param[in]   idx   number of element
+     * @return Angle in radiant.
+     */
+    [[nodiscard]] double largestAngle(int idx) const;
+    
+    /**
+     * Calculates the largest inner angle for all elements 
+     * and returns them in vector.
+     * 
+     * @return Vector with largest angle for each element..
+     */    
+    [[nodiscard]] std::vector<double> getLargestAngles() const;
+    
+    /**
+     * Determines all node to node connections from the vertex based mesh.
+     *
+     * @return vector[k][] containing all connections of vertex k, including to itself.
+     */
+    [[nodiscard]] std::vector<std::vector<int>> Node2NodeGraph_1() const;  // is correct
+
+    /**
+     * Determines all node to node connections from the vertex based mesh.
+      *
+      * Faster than @p Node2NodeGraph_1().
+      *
+     * @return vector[k][] containing all connections of vertex k, including to itself.
+     */
+    [[nodiscard]] std::vector<std::vector<int>> Node2NodeGraph_2() const;  // is correct
+
+    //private:
+protected:
+    int _nelem;         //!< number elements
+    int _nvert_e;       //!< number of geometric vertices per element
+    int _ndof_e;        //!< degrees of freedom (d.o.f.) per element
+    int _nnode;         //!< number nodes/vertices
+    int _ndim;          //!< space dimension of the problem (1, 2, or 3)
+    std::vector<int> _ia;    //!< element connectivity
+    std::vector<double> _xc; //!< coordinates
+
+protected:
+    // B.C.
+    std::vector<int> _bedges;     //!< boundary edges [nbedges][2] storing start/end vertex
+
+    //private:
+protected:
+    // edge based connectivity
+    int _nedge;              //!< number of edges in mesh
+    int _nedge_e;            //!< number of edges per element
+    std::vector<int> _edges; //!< edges of mesh (vertices ordered ascending)
+    std::vector<int> _ea;    //!< edge based element connectivity
+    // B.C.
+    std::vector<int> _ebedges; //!< boundary edges [nbedges]
+
+private:
+    const std::vector<int> _dummy; //!< empty dummy vector
+
+};
+
+
+/**
+ * Determines all node to node connections from the element connectivity @p ia.
+ * 
+ * @param[in] nelem   number of elements
+ * @param[in] ndof_e  degrees of freedom per element
+ * @param[in] ia      element connectivity [nelem*ndof_e]
+ * @return vector[k][] containing all connections of vertex k, including to itself. * name: unknown
+ * 
+ */
+std::vector<std::vector<int>> Node2NodeGraph(int nelem, int ndof_e,
+                                            std::vector<int> const &ia);
+
+
+/**
+ * Returns the vertex index of the arithmetic mean of vertices @p v1 and @p v2.
+ * 
+ * If that vertex is not already contained in the coordinate vector @p xc then 
+ * this new vertex is appended to @p xc.
+ * 
+ * @param[in]     v1    index of vertex
+ * @param[in]     v2    index of vertex
+ * @param[in,out] xc    coordinate vector [nnodes*ndim]
+ * @param[in]     ndim  space dimension
+ * @return vertex index of midpoint of vertices @p v1 and @p v2.
+ * 
+ */
+int appendMidpoint(int v1, int v2, std::vector<double> &xc, int ndim=3);
+
+/**
+ * Determines the index of a vertex @p xm in the coordinate vector @p xc.
+ * 
+ * @param[in]     xm    one vertex
+ * @param[in]     xc    vector of vertices [nnodes*ndim]
+ * @param[in]     ndim  space dimension
+ * @return index in vector or -1 in case the vertex is not contained in the vector.
+ * 
+ */
+int getVertexIndex(std::vector<double> const &xm, std::vector<double> const &xc, int ndim=3);
+
+
+/**
+ * Compares two floating point numbers with respect to a sloppy accuracy.
+ * 
+ * @param[in]     a  number
+ * @param[in]     b  number
+ * @param[in]   eps  accuracy
+ * @return result of @f$ |a-b| < \varepsilon @f$
+ * 
+ */
+inline
+bool equal(double a, double b, double eps=1e-6)
+{
+    return std::abs(b-a)<eps;
+}
+
+// *********************************************************************
+
+class RefinedMesh: public Mesh
+{
+public:
+    /**
+     * Constructs a refined mesh according to the marked elements in @p ibref.
+     *
+     * If the vector @p ibref has size 0 then all elements will be refined.
+     *
+     * @param[in] cmesh  original mesh for coarsening.
+     * @param[in] ibref  vector containing True/False regarding refinement for each element
+     *
+     */
+    //explicit RefinedMesh(Mesh const &cmesh, std::vector<bool> const &ibref = std::vector<bool>(0));
+    RefinedMesh(Mesh const &cmesh, std::vector<bool> ibref);
+    //RefinedMesh(Mesh const &cmesh, std::vector<bool> const &ibref);
+
+    /**
+    * Constructs a refined mesh by regulare refinement of all elements.
+    *
+    * @param[in] cmesh  original mesh for coarsening.
+    *
+    */
+    explicit RefinedMesh(Mesh const &cmesh)
+        : RefinedMesh(cmesh, std::vector<bool>(0))
+    {}
+
+
+    RefinedMesh(RefinedMesh const &) = delete;
+    //RefinedMesh(RefinedMesh const&&) = delete;
+
+    RefinedMesh &operator=(RefinedMesh const &) = delete;
+    //RefinedMesh& operator=(RefinedMesh const&&) = delete;
+
+    /**
+     * Destructor.
+     */
+    ~RefinedMesh() override;
+
+    /**
+     * Refines the mesh according to the marked elements.
+     *
+     * @param[in] ibref  vector containing True/False regarding refinement for each element
+     *
+     * @return the refined mesh
+     *
+     */
+    Mesh RefineElements(std::vector<bool> const &ibref);
+
+    /**
+     * Refines all elements in the actual mesh.
+     *
+     * @param[in] nref  number of regular refinements to perform
+     *
+     */
+    void RefineAllElements(int nref = 1);
+
+    /**
+     * Accesses the father-of-nodes relation.
+      *
+     * @return  father-of-nodes relation [nnodes][2]
+      *
+     */
+    [[nodiscard]] std::vector<int> const &GetFathersOfVertices() const override
+    {
+        return _vfathers;
+    }
+
+protected:
+    /**
+     * Checks whether the array dimensions fit to their appropriate size parameters
+     * @return index vector.
+     */
+    [[nodiscard]] bool Check_array_dimensions() const override;
+
+    /**
+     * Permutes the vertex information in an edge based mesh.
+      *
+     * @param[in] old2new   new indices of original vertices.
+     */
+    void PermuteVertices_EdgeBased(std::vector<int> const &old2new) override;
+
+
+private:
+    //Mesh const              & _cmesh; //!< coarse mesh
+    std::vector<bool> const   _ibref; //!< refinement info
+    int                       _nref;  //!< number of regular refinements performed
+    std::vector<int>          _vfathers; //!< stores the 2 fathers of each vertex (equal fathers denote original coarse vertex)
+
+};
+
+// *********************************************************************
+
+class gMesh_Hierarchy
+{
+public:
+    /**
+     * Constructs mesh hierarchy of @p nlevel levels starting with coarse mesh @p cmesh.
+      * The coarse mesh @p cmesh will be @p nlevel-1 times geometrically refined.
+      *
+      * @param[in] cmesh   initial coarse mesh
+     * @param[in] nlevel  number levels in mesh hierarchy
+      *
+     */
+    gMesh_Hierarchy(Mesh const &cmesh, int nlevel);
+
+    [[nodiscard]] size_t size() const
+    {
+        return _gmesh.size();
+    }
+
+    /**
+     * Access to mesh @p lev from mesh hierarchy.
+      *
+     * @return mesh @p lev
+      * @warning An out_of_range exception might be thrown.
+      *
+     */
+    Mesh const &operator[](int lev) const
+    {
+        return *_gmesh.at(lev);
+    }
+
+    /**
+     * Access to finest mesh in mesh hierarchy.
+      *
+     * @return finest mesh
+      *
+     */
+    [[nodiscard]] Mesh const &finest() const
+    {
+        return *_gmesh.back();
+    }
+
+    /**
+     * Access to coarest mesh in mesh hierarchy.
+      *
+     * @return coarsest mesh
+      *
+     */
+    [[nodiscard]] Mesh const &coarsest() const
+    {
+        return *_gmesh.front();
+    }
+
+private:
+    std::vector<std::shared_ptr<Mesh>> _gmesh; //!< mesh hierarchy from coarse ([0]) to fine.
+
+};
+
+
+
+// *********************************************************************
+/**
+ * 2D finite element mesh of the square consisting of linear triangular elements.
+ */
+class Mesh_2d_3_square: public Mesh
+{
+public:
+    /**
+     * Generates the f.e. mesh for the unit square.
+     *
+     * @param[in] nx    number of discretization intervals in x-direction
+     * @param[in] ny    number of discretization intervals in y-direction
+     * @param[in] myid  my MPI-rank / subdomain
+     * @param[in] procx number of ranks/subdomains in x-direction
+     * @param[in] procy number of processes in y-direction
+    */
+    Mesh_2d_3_square(int nx, int ny, int myid = 0, int procx = 1, int procy = 1);
+
+    /**
+     * Destructor
+     */
+    ~Mesh_2d_3_square() override;
+
+    /**
+     * Set solution vector based on a tensor product grid in the rectangle.
+     * @param[in] u solution vector
+     */
+    void SetU(std::vector<double> &u) const;
+
+    /**
+     * Set right hand side (rhs) vector on a tensor product grid in the rectangle.
+     * @param[in] f rhs vector
+     */
+    void SetF(std::vector<double> &f) const;
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    [[nodiscard]] std::vector<int> Index_DirichletNodes() const override;
+
+    /**
+      * Stores the values of vector @p u of (sub)domain into a file @p name for further processing in gnuplot.
+      * The file stores rowise the x- and y- coordinates together with the value from  @p u .
+      * The domain [@p xl, @p xr] x [@p yb, @p yt] is discretized into @p nx x @p ny intervals.
+      *
+      * @param[in] name  basename of file name (file name will be extended by the rank number)
+      * @param[in] u     local vector
+      *
+      * @warning   Assumes tensor product grid in unit square; rowise numbered
+      *            (as generated in class constructor).
+      *            The output is provided for tensor product grid visualization
+      *            ( similar to Matlab-surf() ).
+      *
+      * @see Mesh_2d_3_square
+      */
+    void SaveVectorP(std::string const &name, std::vector<double> const &u) const;
+
+    // here will still need to implement in the class
+    //  GetBound(), AddBound()
+    //  or better a generalized way with indices and their appropriate ranks for MPI communication
+
+private:
+    /**
+      * Determines the coordinates of the discretization nodes of the domain [@p xl, @p xr] x [@p yb, @p yt]
+      * which is discretized into @p nx x @p ny intervals.
+      * @param[in] nx    number of discretization intervals in x-direction
+      * @param[in] ny    number of discretization intervals in y-direction
+      * @param[in] xl    x-coordinate of left boundary
+      * @param[in] xr    x-coordinate of right boundary
+      * @param[in] yb    y-coordinate of lower boundary
+      * @param[in] yt    y-coordinate of upper boundary
+      * @param[out] xc   coordinate vector of length 2n with x(2*k,2*k+1) as coordinates of node k
+      */
+
+    static void GetCoordsInRectangle(int nx, int ny, double xl, double xr, double yb, double yt,
+                              double xc[]);
+    /**
+      * Determines the element connectivity of linear triangular elements of a FEM discretization
+      * of a rectangle using @p nx x @p ny equidistant intervals for discretization.
+      * @param[in] nx    number of discretization intervals in x-direction
+      * @param[in] ny    number of discretization intervals in y-direction
+      * @param[out] ia   element connectivity matrix with ia(3*s,3*s+1,3*s+2) as node numbers od element s
+      */
+    static void GetConnectivityInRectangle(int nx, int ny, int ia[]);
+
+private:
+    int _myid;          //!< my MPI rank
+    int _procx;         //!< number of MPI ranks in x-direction
+    int _procy;         //!< number of MPI ranks in y-direction
+    std::array<int, 4> _neigh; //!< MPI ranks of neighbors (negative: no neighbor but b.c.)
+    int _color;         //!< red/black coloring (checker board) of subdomains
+
+    double _xl;         //!< x coordinate of lower left  corner of square
+    double _xr;         //!< x coordinate of lower right corner of square
+    double _yb;         //!< y coordinate or lower left  corner of square
+    double _yt;         //!< y coordinate of upper right corner of square
+    int    _nx;         //!< number of intervals in x-direction
+    int    _ny;         //!< number of intervals in y-direction
+};
+
+// *********************************************************************
--- a/mgrid_2/getmatrix.cpp
+++ b/mgrid_2/getmatrix.cpp
@ -0,0 +1,946 @@
+#include "binaryIO.h"
+#include "getmatrix.h"
+#include "userset.h"
+#include "utils.h"
+
+#include "omp.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <ctime>                  // contains clock()
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <string>
+#include <utility>
+#include <vector>
+using namespace std;
+
+// ####################################################################
+
+Matrix::Matrix(int const nrows, int const ncols)
+    : _nrows(nrows), _ncols(ncols), _dd(0)
+{}
+
+
+Matrix::~Matrix()
+{}
+
+vector<double> const & Matrix::GetDiag() const
+{
+   //bool ddEmpty;
+   ////#pragma omp critical
+   //ddEmpty= (_dd.empty());     // local variable!
+  
+  // GH: Move allocation etc. to constructor !?
+   if ( _dd.empty() )         
+   {
+       //#pragma omp single
+       //std::cout << "PPPPPPPPPPPPPPPPPPPP\n";               
+       #pragma omp barrier
+       #pragma omp single
+       _dd.resize(Nrows());
+       //#pragma omp barrier
+       this->GetDiag(_dd);
+
+    }
+    assert( Nrows()==static_cast<int>(_dd.size()) );
+    //#pragma omp master
+    //std::cout << ".";
+    return _dd;
+}
+
+// ####################################################################
+
+CRS_Matrix::CRS_Matrix()
+    : Matrix(0, 0), _nnz(0), _id(0), _ik(0), _sk(0)
+{}
+
+CRS_Matrix::CRS_Matrix(const std::string &file) : Matrix(0, 0), _nnz(0), _id(0), _ik(0), _sk(0)
+{
+    readBinary(file);
+    _nrows = static_cast<int>(size(_id) - 1);
+    _ncols = _nrows;
+}
+
+
+CRS_Matrix::~CRS_Matrix()
+{}
+
+void CRS_Matrix::Mult(vector<double> &w, vector<double> const &u) const
+{
+    assert( _ncols == static_cast<int>(u.size()) ); // compatibility of inner dimensions
+    assert( _nrows == static_cast<int>(w.size()) ); // compatibility of outer dimensions
+
+    #pragma omp parallel for
+    for (int row = 0; row < _nrows; ++row) {
+        double wi = 0.0;
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij) {
+            wi += _sk[ij] * u[ _ik[ij] ];
+        }
+        w[row] = wi;
+    }
+    return;
+}
+
+void CRS_Matrix::Defect(vector<double> &w,
+                        vector<double> const &f, vector<double> const &u) const
+{
+    assert( _ncols == static_cast<int>(u.size()) ); // compatibility of inner dimensions
+    assert( _nrows == static_cast<int>(w.size()) ); // compatibility of outer dimensions
+    assert( w.size() == f.size() );
+
+    //#pragma omp parallel for
+    #pragma omp for
+    for (int row = 0; row < _nrows; ++row) {
+        double wi = f[row];
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij) {
+            wi -= _sk[ij] * u[ _ik[ij] ];
+        }
+        w[row] = wi;
+    }
+    return;
+}
+
+
+void CRS_Matrix::JacobiSmoother(std::vector<double> const &f, std::vector<double> &u,
+                    std::vector<double> &r, int nsmooth, double const omega, bool zero) const
+{
+    // ToDO: ensure compatible dimensions
+    //#pragma omp master
+    //cout << "Jac in\n"; 
+    assert(_ncols==_nrows);
+    assert( _ncols == static_cast<int>(u.size()) ); // compatibility of inner dimensions
+    assert( _nrows == static_cast<int>(r.size()) ); // compatibility of outer dimensions
+    assert( r.size() == f.size() );
+    
+    auto const &D = Matrix::GetDiag();        // accumulated diagonal of matrix @p SK.
+    //#pragma omp barrier
+    //#pragma omp master
+    //cout << "Matrix::GetDiag finished\n";
+         
+    if (zero) {            // assumes initial solution is zero
+        #pragma omp for    
+        for (int k = 0; k < _nrows; ++k) {
+            // u := u + om*D^{-1}*f
+            u[k] = omega*f[k] / D[k]; // MPI: distributed to accumulated vector needed
+        }
+        //#pragma omp single
+        --nsmooth;                           // first smoothing sweep done
+    }
+    //cout << zero << endl;
+    //cout << nsmooth << endl;
+ 
+    for (int ns = 1; ns <= nsmooth; ++ns) {
+        //Defect(r, f, u);                  //  r := f - K*u    
+        #pragma omp  for
+        for (int row = 0; row < _nrows; ++row) {
+            double wi = f[row];
+            for (int ij = _id[row]; ij < _id[row + 1]; ++ij) {
+                wi -= _sk[ij] * u[ _ik[ij] ];
+            }
+            r[row] = wi;
+        }        
+        #pragma omp for    
+        for (int k = 0; k < _nrows; ++k) {
+            // u := u + om*D^{-1}*r
+            u[k] = u[k] + omega * r[k] / D[k]; // MPI: distributed to accumulated vector needed
+        }
+    }
+    //#pragma omp master
+    //cout << "Jac out\n"; 
+    return;
+}
+
+void CRS_Matrix::GetDiag(vector<double> &d) const
+{
+    // be carefull when using a rectangular matrix
+    int const nm = min(_nrows, _ncols);
+
+    assert( nm == static_cast<int>(d.size()) ); // instead of stopping we could resize d and warn the user
+
+    //#pragma omp parallel for
+    #pragma omp for
+    for (int row = 0; row < nm; ++row) {
+        const int ia = fetch(row, row); // Find diagonal entry of row
+        assert(ia >= 0);
+        d[row] = _sk[ia];
+    }
+    cout << ">>>>> CRS_Matrix::GetDiag  <<<<<" << endl;
+    return;
+}
+
+inline
+int CRS_Matrix::fetch(int const row, int const col) const
+{
+    int const id2 = _id[row + 1];    // end   and
+    int       ip  = _id[row];        // start of recent row (global index)
+
+    while (ip < id2 && _ik[ip] != col) { // find index col (global index)
+        ++ip;
+    }
+    if (ip >= id2) {
+        ip = -1;
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+        cout << "No column  " << col << "  in row  " << row << endl;
+        assert(ip >= id2);
+#endif
+    }
+    return ip;
+}
+
+void CRS_Matrix::Debug() const
+{
+//  ID points to first entry of row
+//  no symmetry assumed
+    cout << "\nMatrix  (" << _nrows << " x " << _ncols << "  with  nnz = " << _id[_nrows] << ")\n";
+
+    for (int row = 0; row < _nrows; ++row) {
+        cout << "Row " << row << " : ";
+        int const id1 = _id[row];
+        int const id2 = _id[row + 1];
+        for (int j = id1; j < id2; ++j) {
+            cout.setf(ios::right, ios::adjustfield);
+            cout << "[" << setw(2) << _ik[j] << "]  " << setw(4) << _sk[j] << "  ";
+        }
+        cout << endl;
+    }
+    return;
+}
+
+
+
+bool CRS_Matrix::Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const
+{
+    bool bn = (nnode == _nrows);     // number of rows
+    if (!bn) {
+        cout << "#########   Error: " << "number of rows" << endl;
+    }
+
+    bool bz = (id[nnode] == _nnz);   // number of non zero elements
+    if (!bz) {
+        cout << "#########   Error: " << "number of non zero elements" << endl;
+    }
+
+    bool bd = equal(id, id + nnode + 1, _id.cbegin()); // row starts
+    if (!bd) {
+        cout << "#########   Error: " << "row starts" << endl;
+    }
+
+    bool bk = equal(ik, ik + id[nnode], _ik.cbegin()); // column indices
+    if (!bk) {
+        cout << "#########   Error: " << "column indices" << endl;
+    }
+
+    bool bv = equal(sk, sk + id[nnode], _sk.cbegin()); // values
+    if (!bv) {
+        cout << "#########   Error: " << "values" << endl;
+    }
+
+    return bn && bz && bd && bk && bv;
+}
+
+
+void CRS_Matrix::writeBinary(const std::string &file)
+{
+    vector<int> cnt(size(_id) - 1);
+    for (size_t k = 0; k < size(cnt); ++k) {
+        cnt[k] = _id[k + 1] - _id[k];
+    }
+    //adjacent_difference( cbegin(_id)+1, cend(_id), cnt );
+    write_binMatrix(file, cnt, _ik, _sk);
+}
+
+void CRS_Matrix::readBinary(const std::string &file)
+{
+    vector<int> cnt;
+    read_binMatrix(file, cnt, _ik, _sk);
+    _id.resize(size(cnt) + 1);
+    _id[0] = 0;
+    for (size_t k = 0; k < size(cnt); ++k) {
+        _id[k + 1] = _id[k] + cnt[k];
+    }
+    //partial_sum( cbegin(cnt), cend(cnt), begin(_id)+1 );
+}
+
+// ####################################################################
+
+FEM_Matrix::FEM_Matrix(Mesh const &mesh)
+    : CRS_Matrix(), _mesh(mesh)
+{
+    Derive_Matrix_Pattern();
+    return;
+}
+
+FEM_Matrix::~FEM_Matrix()
+{}
+
+void FEM_Matrix::Derive_Matrix_Pattern_fast()
+{
+    cout << "\n############   FEM_Matrix::Derive_Matrix_Pattern ";
+    MyTimer tstart;    //tstart.tic();
+    
+    int const nelem(_mesh.Nelems());
+    int const ndof_e(_mesh.NdofsElement());
+    auto const &ia(_mesh.GetConnectivity());
+//  Determine the number of matrix rows
+    _nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem);
+    ++_nrows;                                 // node numberng: 0 ... nnode-1
+    assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ?
+
+// CSR data allocation
+    _id.resize(_nrows + 1);                  // Allocate memory for CSR row pointer
+//##########################################################################
+    auto const v2v = _mesh.Node2NodeGraph();
+    _nnz = 0;                         // number of connections
+    _id[0] = 0;                       // start of matrix row zero
+    for (size_t v = 0; v < v2v.size(); ++v ) {
+        _id[v + 1] = _id[v] + v2v[v].size();
+        _nnz += v2v[v].size();
+    }
+    assert(_nnz == _id[_nrows]);
+    _sk.resize(_nnz);                        // Allocate memory for CSR column index vector
+
+// CSR data allocation
+    _ik.resize(_nnz);                        // Allocate memory for CSR column index vector
+// Copy column indices
+    int kk = 0;
+    for (const auto & v : v2v) {
+        for (size_t vi = 0; vi < v.size(); ++vi) {
+            _ik[kk] = v[vi];
+            ++kk;
+        }
+    }
+    _ncols = *max_element(_ik.cbegin(), _ik.cend());  // maximal column number
+    ++_ncols;                                         // node numbering: 0 ... nnode-1
+    //cout << _nrows << "  " << _ncols << endl;
+    assert(_ncols == _nrows);
+
+    cout << "finished in  " <<  tstart.toc()  << " sec.    ########\n";
+  
+
+    return;
+}
+
+
+void FEM_Matrix::Derive_Matrix_Pattern_slow()
+{
+    cout << "\n############   FEM_Matrix::Derive_Matrix_Pattern slow ";
+    auto tstart = clock();
+    int const nelem(_mesh.Nelems());
+    int const ndof_e(_mesh.NdofsElement());
+    auto const &ia(_mesh.GetConnectivity());
+//  Determine the number of matrix rows
+    _nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem);
+    ++_nrows;                                 // node numberng: 0 ... nnode-1
+    assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ?
+
+//  Collect for each node those nodes it is connected to (multiple entries)
+//  Detect the neighboring nodes
+    vector< list<int> > cc(_nrows);             //  cc[i] is the  list of nodes a node 'i' is connected to
+    for (int i = 0; i < nelem; ++i) {
+        int const idx = ndof_e * i;
+        for (int k = 0; k < ndof_e; ++k) {
+            list<int> &cck = cc[ia[idx + k]];
+            cck.insert( cck.end(), ia.cbegin() + idx, ia.cbegin() + idx + ndof_e );
+        }
+    }
+//  Delete the multiple entries
+    _nnz = 0;
+    for (auto &it : cc) {
+        it.sort();
+        it.unique();
+        _nnz += it.size();
+        // cout << it.size() << " :: "; copy(it->begin(),it->end(), ostream_iterator<int,char>(cout,"  ")); cout << endl;
+    }
+
+// CSR data allocation
+    _id.resize(_nrows + 1);                  // Allocate memory for CSR row pointer
+    _ik.resize(_nnz);                        // Allocate memory for CSR column index vector
+
+//  copy CSR data
+    _id[0] = 0;                              // begin of first row
+    for (size_t i = 0; i < cc.size(); ++i) {
+        //cout << i << "   " << nid.at(i) << endl;;
+        const list<int> &ci = cc[i];
+        const auto nci = static_cast<int>(ci.size());
+        _id[i + 1] = _id[i] + nci; // begin of next line
+        copy(ci.begin(), ci.end(), _ik.begin() + _id[i] );
+    }
+
+    assert(_nnz == _id[_nrows]);
+    _sk.resize(_nnz);                        // Allocate memory for CSR column index vector
+
+    _ncols = *max_element(_ik.cbegin(), _ik.cend());  // maximal column number
+    ++_ncols;                                 // node numbering: 0 ... nnode-1
+    //cout << _nrows << "  " << _ncols << endl;
+    assert(_ncols == _nrows);
+
+    double duration = static_cast<double>(clock() - tstart) / CLOCKS_PER_SEC;  // ToDo: change to  systemclock
+    cout << "finished in  " <<  duration  << " sec.    ########\n";
+
+    return;
+}
+
+
+void FEM_Matrix::CalculateLaplace(vector<double> &f)
+{
+    cout << "\n############   FEM_Matrix::CalculateLaplace ";
+    //double tstart = clock();
+    double tstart = omp_get_wtime();                  // OpenMP
+    assert(_mesh.NdofsElement() == 3);               // only for triangular, linear elements
+    //cout << _nnz << " vs. " << _id[_nrows] << "  " << _nrows<< endl;
+    assert(_nnz == _id[_nrows]);
+
+    for (int k = 0; k < _nrows; ++k) {
+        _sk[k] = 0.0;
+    }
+    for (int k = 0; k < _nrows; ++k) {
+        f[k] = 0.0;
+    }
+
+    double ske[3][3], fe[3];
+    //  Loop over all elements
+    auto const nelem = _mesh.Nelems();
+    auto const &ia   = _mesh.GetConnectivity();
+    auto const &xc   = _mesh.GetCoords();
+
+    #pragma omp parallel for private(ske,fe)
+    for (int i = 0; i < nelem; ++i) {
+        CalcElem(ia.data() + 3 * i, xc.data(), ske, fe);
+        //AddElem(ia.data()+3 * i, ske, fe, _id.data(), _ik.data(), _sk.data(), f.data()); // GH: deprecated
+        AddElem_3(ia.data() + 3 * i, ske, fe, f);
+    }
+
+    //double duration = (clock() - tstart) / CLOCKS_PER_SEC;
+    double duration = omp_get_wtime() - tstart;             // OpenMP
+    cout << "finished in  " <<  duration  << " sec.    ########\n";  // ToDo: change to  systemclock
+    //Debug();
+
+    return;
+}
+
+void FEM_Matrix::CalculateRHS(vector<double> &f, const std::function<double(double,double)> &func)
+{
+    cout << "\n############   FEM_Matrix::CalculateRHS ";
+    //double tstart = clock();
+    double tstart = omp_get_wtime();                  // OpenMP
+    assert(_mesh.NdofsElement() == 3);               // only for triangular, linear elements
+    //cout << _nnz << " vs. " << _id[_nrows] << "  " << _nrows<< endl;
+    assert(_nnz == _id[_nrows]);
+
+    for (int k = 0; k < _nrows; ++k) {
+        f[k] = 0.0;
+    }
+
+    double fe[3];
+    //  Loop over all elements
+    auto const nelem = _mesh.Nelems();
+    auto const &ia   = _mesh.GetConnectivity();
+    auto const &xc   = _mesh.GetCoords();
+
+    #pragma omp parallel for private(fe)
+    for (int i = 0; i < nelem; ++i) {
+        CalcElem_RHS(ia.data() + 3 * i, xc.data(), fe, func);
+        AddElemRHS_3(ia.data() + 3 * i, fe, f);
+    }
+
+    //double duration = (clock() - tstart) / CLOCKS_PER_SEC;
+    double duration = omp_get_wtime() - tstart;             // OpenMP
+    cout << "finished in  " <<  duration  << " sec.    ########\n";  // ToDo: change to  systemclock
+    //Debug();
+
+    return;
+}
+
+
+void FEM_Matrix::ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f)
+{
+    auto const idx = _mesh.Index_DirichletNodes();
+    int const nidx = idx.size();
+
+    for (int i = 0; i < nidx; ++i) {
+        int const row = idx[i];
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij) {
+            int const col = _ik[ij];
+            if (col == row) {
+                _sk[ij] = 1.0;
+                f[row]  = u[row];
+            }
+            else {
+                int const id1 = fetch(col, row); // Find entry (col,row)
+                assert(id1 >= 0);
+                f[col] -= _sk[id1] * u[row];
+                _sk[id1] = 0.0;
+                _sk[ij]  = 0.0;
+            }
+        }
+    }
+
+    return;
+}
+
+
+
+void FEM_Matrix::AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], vector<double> &f)
+{
+    for (int i = 0; i < 3; ++i) {
+        const int ii  = ial[i];           // row ii (global index)
+        for (int j = 0; j < 3; ++j) {     // no symmetry assumed
+            const int jj = ial[j];        // column jj (global index)
+            const int ip = fetch(ii, jj);       // find column entry jj in row ii
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+            if (ip < 0) {      // no entry found !!
+                cout << "Error in AddElem: (" << ii << "," << jj << ") ["
+                     << ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
+                assert(ip >= 0);
+            }
+#endif
+            #pragma omp atomic
+            _sk[ip] += ske[i][j];
+        }
+        #pragma omp atomic
+        f[ii] += fe[i];
+    }
+}
+
+void FEM_Matrix::AddElemRHS_3(int const ial[3], double const fe[3], vector<double> &f)
+{
+    for (int i = 0; i < 3; ++i) {
+        const int ii  = ial[i];           // row ii (global index)
+        #pragma omp atomic
+        f[ii] += fe[i];
+    }
+}
+
+bool CRS_Matrix::CheckSymmetry() const
+{
+    cout << "+++  Check matrix symmetry  +++" << endl;
+    bool bs{true};
+    #pragma omp parallel for reduction(&&:bs)
+    for (int row = 0; row < Nrows(); ++row) {
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij) {
+            const int col = _ik[ij];       // column col (global index)
+            const int ip = fetch(col, row);  // find column entry row in row col
+            if (ip < 0) {      // no entry found !!
+                cout << "Matrix has non-symmetric pattern at (" << row << "," << col << ")" << endl;
+                bs = false;
+                //assert(ip >= 0);
+            }
+            if ( std::abs(_sk[ij] - _sk[ip]) > 1e-13) {
+                cout << "Matrix has non-symmetric entries at (" << row << "," << col << ")" << endl;
+                bs = false;
+            }
+        }
+    }
+    return bs;
+}
+
+
+bool CRS_Matrix::CheckRowSum() const
+{
+    cout << "+++  Check row sum  +++" << endl;
+    vector<double> rhs(Ncols(), 1.0);
+    vector<double> res(Nrows());
+
+    Mult(res, rhs);
+
+    bool bb{true};
+    #pragma omp parallel for reduction(&&:bb)
+    for (size_t k = 0; k < res.size(); ++k) {
+        //if (std::abs(res[k]) != 0.0)
+        if (std::abs(res[k]) > 1e-14) {
+            cout << "!! Nonzero row " << k << " : sum = " << res[k] << endl;
+            bb = false;
+        }
+    }
+    return bb;
+}
+
+bool CRS_Matrix::CheckMproperty() const
+{
+    cout << "+++  Check M property  +++" << endl;
+    bool bm{true};
+    //#pragma omp parallel for reduction(&&:bm)
+    for (int row = 0; row < Nrows(); ++row) {
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij) {
+            bool b_diag{true}, b_off{true};
+            if (_ik[ij] == row) {
+                b_diag = _sk[ij] > 0.0;
+                if (!b_diag) {
+                    cout << "## negative diag in row " << row << " : " << _sk[ij] << endl;
+                    bm = false;
+                }
+            }
+            else {
+                b_off = _sk[ij] <= 0.0;
+                if (!b_off) {
+                    //cout << "!! positive off-diag [" << row << "," << _ik[ij] << "] : " << _sk[ij] << endl;
+                    bm = false;
+                }
+            }
+        }
+    }
+    return bm;
+}
+
+bool CRS_Matrix::ForceMproperty()
+{
+    cout << "+++  Force M property  +++" << endl;
+    bool bm{false};
+    #pragma omp parallel for reduction(&&:bm)
+    for (int row = 0; row < Nrows(); ++row) {
+        double corr{0.0};
+        int idiag = {-1};
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij) {
+            if (_ik[ij] != row &&  _sk[ij] > 0.0) {
+                corr   += _sk[ij];
+                _sk[ij] = 0.0;
+                bm = true;
+            }
+            if (_ik[ij] == row) {
+                idiag = ij;
+            }
+        }
+        assert(idiag >= 0);
+        _sk[idiag] += corr;
+    }
+    return bm;
+}
+
+bool CRS_Matrix::CheckMatrix() const
+{
+    bool b0 = CheckSymmetry();
+    if (!b0) {
+        cout << " !!!!  N O   S Y M M E T R Y" << endl;
+    }
+    
+    bool b1 = CheckRowSum();
+    if (!b1) {
+        cout << " !!!!  R O W   S U M   E R R O R" << endl;
+    }
+
+    bool b2 = CheckMproperty();
+    if (!b2) {
+        cout << " !!!!  N O   M - M A T R I X" << endl;
+    }
+
+    return b1 && b2;
+}
+
+void CRS_Matrix::GetDiag_M(vector<double> &d) const
+{
+    // be carefull when using a rectangular matrix
+    //int const nm = min(_nrows, _ncols);
+    #pragma omp single
+    assert( min(_nrows, _ncols) == static_cast<int>(d.size()) ); // instead of stopping we could resize d and warn the user
+
+    #pragma omp for
+    for (int row = 0; row < Nrows(); ++row) {
+        d[row] = 0.0;
+        double v_ii{-1.0};
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij) {
+            if (_ik[ij] != row) {
+                d[row] += std::abs(_sk[ij]);
+            }
+            else {
+                v_ii = _sk[ij];
+            }
+        }
+        if ( d[row] < v_ii ) {
+            d[row] = v_ii;
+        }
+    }
+    #pragma omp master
+    cout << "<<<<<<<  GetDiag_M (finished)   >>>>>>>>>" << endl;
+    return;
+}
+
+
+//  general routine for lin. triangular elements
+
+void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3])
+//void CalcElem(const int* __restrict__ ial, const double* __restrict__ xc, double* __restrict__ ske[3], double* __restrict__ fe)
+{
+    const int  i1  = 2 * ial[0],   i2 = 2 * ial[1],   i3 = 2 * ial[2];
+    const double x13 = xc[i3 + 0] - xc[i1 + 0],  y13 = xc[i3 + 1] - xc[i1 + 1],
+                 x21 = xc[i1 + 0] - xc[i2 + 0],  y21 = xc[i1 + 1] - xc[i2 + 1],
+                 x32 = xc[i2 + 0] - xc[i3 + 0],  y32 = xc[i2 + 1] - xc[i3 + 1];
+    const double jac = fabs(x21 * y13 - x13 * y21);
+
+    ske[0][0] = 0.5 / jac * (y32 * y32 + x32 * x32);
+    ske[0][1] = 0.5 / jac * (y13 * y32 + x13 * x32);
+    ske[0][2] = 0.5 / jac * (y21 * y32 + x21 * x32);
+    ske[1][0] = ske[0][1];
+    ske[1][1] = 0.5 / jac * (y13 * y13 + x13 * x13);
+    ske[1][2] = 0.5 / jac * (y21 * y13 + x21 * x13);
+    ske[2][0] = ske[0][2];
+    ske[2][1] = ske[1][2];
+    ske[2][2] = 0.5 / jac * (y21 * y21 + x21 * x21);
+
+    const double xm    = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0,
+                 ym    = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0;
+    //fe[0] = fe[1] = fe[2] = 0.5 * jac * FunctF(xm, ym) / 3.0;
+    fe[0] = fe[1] = fe[2] = 0.5 * jac * fNice(xm, ym) / 3.0;
+}
+
+
+void CalcElem_RHS(int const ial[3], double const xc[], double fe[3],
+             const std::function<double(double,double)> &func)
+{
+    const int  i1  = 2 * ial[0],   i2 = 2 * ial[1],   i3 = 2 * ial[2];
+    const double x13 = xc[i3 + 0] - xc[i1 + 0],  y13 = xc[i3 + 1] - xc[i1 + 1],
+                 x21 = xc[i1 + 0] - xc[i2 + 0],  y21 = xc[i1 + 1] - xc[i2 + 1];
+                 //x32 = xc[i2 + 0] - xc[i3 + 0],  y32 = xc[i2 + 1] - xc[i3 + 1];
+    const double jac = fabs(x21 * y13 - x13 * y21);
+
+    const double xm    = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0,
+                 ym    = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0;
+    fe[0] = fe[1] = fe[2] = 0.5 * jac * func(xm, ym) / 3.0;
+}
+
+void CalcElem_Masse(int const ial[3], double const xc[], double ske[3][3])
+{
+    const int  i1  = 2 * ial[0],   i2 = 2 * ial[1],   i3 = 2 * ial[2];
+    const double x13 = xc[i3 + 0] - xc[i1 + 0],  y13 = xc[i3 + 1] - xc[i1 + 1],
+                 x21 = xc[i1 + 0] - xc[i2 + 0],  y21 = xc[i1 + 1] - xc[i2 + 1];
+                 //x32 = xc[i2 + 0] - xc[i3 + 0],  y32 = xc[i2 + 1] - xc[i3 + 1];
+    const double jac = fabs(x21 * y13 - x13 * y21);
+
+    ske[0][0] += jac / 12.0;
+    ske[0][1] += jac / 24.0;
+    ske[0][2] += jac / 24.0;
+    ske[1][0] += jac / 24.0;
+    ske[1][1] += jac / 12.0;
+    ske[1][2] += jac / 24.0;
+    ske[2][0] += jac / 24.0;
+    ske[2][1] += jac / 24.0;
+    ske[2][2] += jac / 12.0;
+
+    return;
+}
+
+// #####################################################################
+
+BisectInterpolation::BisectInterpolation()
+    : Matrix( 0, 0 ), _iv(), _vv()
+{
+}
+
+BisectInterpolation::BisectInterpolation(std::vector<int> const &fathers)
+    : Matrix( static_cast<int>(fathers.size()) / 2, 1 + * max_element(fathers.cbegin(), fathers.cend()) ),
+      _iv(fathers), _vv(fathers.size(), 0.5)
+{
+}
+
+BisectInterpolation::~BisectInterpolation()
+{}
+
+void BisectInterpolation::GetDiag(vector<double> &d) const
+{
+    assert( Nrows() == static_cast<int>(d.size()) );
+
+    for (int k = 0; k < Nrows(); ++k) {
+        if ( _iv[2 * k] == _iv[2 * k + 1] ) {
+            d[k] = 1.0;
+        }
+        else {
+            d[k] = 0.0;
+        }
+    }
+    return;
+}
+
+void BisectInterpolation::Mult(vector<double> &wf, vector<double> const &uc) const
+{
+    assert( Nrows() == static_cast<int>(wf.size()) );
+    assert( Ncols() == static_cast<int>(uc.size()) );
+
+    //#pragma omp parallel for
+    #pragma omp for
+    for (int k = 0; k < Nrows(); ++k) {
+        wf[k] = _vv[2 * k] * uc[_iv[2 * k]] + _vv[2 * k + 1] * uc[_iv[2 * k + 1]];
+    }
+    return;
+}
+
+void BisectInterpolation::MultT(vector<double> const &wf, vector<double> &uc) const
+{
+    assert(  Nrows() == static_cast<int>( wf.size()) );
+    assert(  Ncols() == static_cast<int>( uc.size()) );
+    assert(2*Nrows() == static_cast<int>(_iv.size()) );
+    assert(2*Nrows() == static_cast<int>(_vv.size()) );
+               //#pragma omp single
+            //cout << "xxx\n";  
+    
+    #pragma omp for
+    for (int k = 0; k < Ncols(); ++k)  uc[k] = 0.0;
+             //#pragma  omp single
+            //cout << "yyy\n";     
+   
+// GH: atomic slows down the code ==> use different storage for MultT operation (CRS-matrix?)
+    //#pragma omp parallel for
+    #pragma omp for
+    for (int k = 0; k < Nrows(); ++k) {
+        int const j1=_iv[2 * k    ];
+        int const j2=_iv[2 * k + 1];
+            //#pragma omp critical
+            //cout << uc.size() << "  " << j1 << "  " << j2 << "\n";  
+        #pragma omp atomic
+        uc[j1] += _vv[2 * k  ] * wf[k];
+        //#pragma omp critical
+            //cout << " aa\n";  
+        #pragma omp atomic
+        uc[j2] += _vv[2 * k + 1] * wf[k];
+    }
+            //#pragma  omp single
+            //cout << "zzz\n";  
+    return;
+}
+
+
+void BisectInterpolation::MultT_Full(vector<double> const &wf, vector<double> &uc) const
+{
+    assert( Nrows() == static_cast<int>(wf.size()) );
+    assert( Ncols() == static_cast<int>(uc.size()) );
+// GH: atomic slows down the code ==> use different storage for MultT operation (CRS-matrix?)
+////#pragma omp parallel for
+    for (int k = 0; k < Ncols(); ++k)  uc[k] = 0.0;
+    vector<double> full(uc.size(),0.0);
+    //#pragma omp parallel for
+    for (int k = 0; k < Nrows(); ++k) {
+        if (_iv[2 * k] != _iv[2 * k + 1]) {
+            //#pragma omp atomic
+            uc[_iv[2 * k]  ] += _vv[2 * k  ] * wf[k];
+            //#pragma omp atomic
+            uc[_iv[2 * k + 1]] += _vv[2 * k + 1] * wf[k];
+            full[_iv[2 * k    ]] += _vv[2 * k  ];
+            full[_iv[2 * k + 1]] += _vv[2 * k + 1];
+        }
+        else {
+            //#pragma omp atomic
+            uc[_iv[2 * k]  ] +=  2.0*_vv[2 * k  ] * wf[k]; // uses a property of class BisectInterpolation
+            //uc[_iv[2 * k]  ] +=  _vv[2 * k  ] * wf[k]; // uses a property of class BisectInterpolation
+            full[_iv[2 * k] ] += 2.0*_vv[2 * k  ];
+        }
+    }
+    for (size_t k=0; k<uc.size(); ++k)  uc[k] /= full[k];
+    return;
+}
+
+void BisectInterpolation::Defect(vector<double> &w,
+                                 vector<double> const &f, vector<double> const &u) const
+{
+    assert( Nrows() == static_cast<int>(w.size()) );
+    assert( Ncols() == static_cast<int>(u.size()) );
+    assert( w.size() == f.size() );
+
+    for (int k = 0; k < Nrows(); ++k) {
+        w[k] = f[k] - _vv[2 * k] * u[_iv[2 * k]] + _vv[2 * k + 1] * u[_iv[2 * k + 1]];
+    }
+    return;
+}
+
+void BisectInterpolation::Debug() const
+{
+    for (int k = 0; k < Nrows(); ++k) {
+        cout << k << " : fathers(" << _iv[2 * k] << "," << _iv[2 * k + 1] << ")    ";
+        cout << "weights(" << _vv[2 * k] << "," << _vv[2 * k + 1] << endl;
+    }
+    cout << endl;
+    return;
+}
+
+int BisectInterpolation::fetch(int row, int col) const
+{
+    int idx(-1);
+    if (_iv[2 * row  ] == col) idx = 2 * row;
+    if (_iv[2 * row + 1] == col) idx = 2 * row + 1;
+    assert(idx >= 0);
+    return idx;
+}
+
+// #####################################################################
+
+//BisectIntDirichlet::BisectIntDirichlet(std::vector<int> const &fathers, std::vector<int> const &idxc_dir)
+    //: BisectInterpolation(fathers)
+//{
+    //vector<bool> bdir(Ncols(), false);               // Indicator for Dirichlet coarse nodes
+    //for (size_t kc = 0; kc < idxc_dir.size(); ++kc) {
+        //bdir.at(idxc_dir[kc]) = true;                // Mark Dirichlet node from coarse mesh
+    //}
+
+    //for (size_t j = 0; j < _iv.size(); ++j) {
+        //if ( bdir.at(_iv[j]) )   _vv[j] = 0.0;       // set weight to zero iff (at least) one father is Dirichlet node
+    //}
+    //return;
+//}
+
+BisectIntDirichlet::BisectIntDirichlet(std::vector<int> const &fathers, std::vector<int> idxc_dir)
+    : BisectInterpolation(fathers), _idxDir(std::move(idxc_dir))
+{
+    //vector<bool> bdir(Ncols(), false);               // Indicator for Dirichlet coarse nodes
+    //for (size_t kc = 0; kc < idxc_dir.size(); ++kc) {
+        //bdir.at(idxc_dir[kc]) = true;                // Mark Dirichlet node from coarse mesh
+    //}
+
+    //for (size_t j = 0; j < _iv.size(); ++j) {
+        //if ( bdir.at(_iv[j]) )   _vv[j] = 0.0;       // set weight to zero iff (at least) one father is Dirichlet node
+    //}
+    return;
+}
+
+BisectIntDirichlet::~BisectIntDirichlet()
+{}
+
+
+void BisectIntDirichlet::MultT(vector<double> const &wf, vector<double> &uc) const
+{
+    BisectInterpolation::MultT(wf, uc);
+    for (int kc : _idxDir) {
+        uc.at(kc) = 0.0;                // Set Dirichlet node on coarse mesh to Zero
+    }
+
+    return;
+}
+
+// #####################################################################
+
+void DefectRestrict(CRS_Matrix const &SK, BisectInterpolation const &P,
+                    vector<double> &fc, vector<double> &ff, vector<double> &uf)
+{
+    assert( P.Nrows() == static_cast<int>(ff.size()) );
+    assert( P.Ncols() == static_cast<int>(fc.size()) );
+    assert( ff.size() == uf.size() );
+    assert( P.Nrows() == SK.Nrows() );
+
+//#pragma omp parallel for
+#pragma omp for
+    for (int k = 0; k < P.Ncols(); ++k)  fc[k] = 0.0;
+
+// GH: atomic slows down the code ==> use different storage for MultT operation (CRS-matrix?)
+    //#pragma omp parallel for
+    #pragma omp for
+    for (int row = 0; row < SK._nrows; ++row) {
+        double wi = ff[row];
+        for (int ij = SK._id[row]; ij < SK._id[row + 1]; ++ij) {
+            wi -= SK._sk[ij] * uf[ SK._ik[ij] ];
+        }
+
+        const int i1 = P._iv[2 * row];
+        const int i2 = P._iv[2 * row + 1];
+        //if (i1 != i2) 
+        {
+            #pragma omp atomic
+            fc[i1] += P._vv[2 * row  ] * wi;
+            #pragma omp atomic
+            fc[i2] += P._vv[2 * row + 1] * wi;
+        }
+        //else {
+            //#pragma omp atomic
+            //fc[i1] += 2.0 * P._vv[2 * row  ] * wi; // uses a property of class BisectInterpolation
+        //}
+    }
+    return;
+}
--- a/mgrid_2/getmatrix.h
+++ b/mgrid_2/getmatrix.h
@ -0,0 +1,688 @@
+#pragma once
+
+#include "geom.h"
+#include <cassert>
+#include <functional>
+#include <string>
+#include <vector>
+// #####################################################################
+/**
+ * Abstract matrix class.
+ */
+class Matrix
+{
+    public:
+       /**
+		 * Constructor for abstract matrix class.
+         *
+         * No memory is allocated.
+		 *
+		 * @param[in] nrows   number of matrix rows.
+		 * @param[in] ncols   number of matrix columns.
+		*/
+       Matrix(int nrows, int ncols);
+
+       //Matrix(Matrix &) = default;
+       Matrix(Matrix const&  org)            = default; // Copy constructor
+       Matrix(Matrix      && org)            = default; // Move constructor
+       Matrix& operator=(Matrix const&  rhs) = default; // Copy assignment
+       Matrix& operator=(Matrix      && rhs) = default; // Move assignment
+       virtual ~Matrix();
+
+       /**
+		 * Checks whether the matrix is a square matrix.
+		 *
+		 * @return True iff square matrix.
+		*/
+       bool isSquare() const
+       { return _nrows==_ncols;}
+
+       /**
+		 * Number of rows in matrix.
+		 * @return number of rows.
+		 */
+       int Nrows() const
+          {return _nrows;}
+
+       /**
+		 * Number of columns in matrix.
+		 * @return number of columns.
+		 */
+       int Ncols() const
+          {return _ncols;}
+
+       /**
+		 * Show the matrix entries.
+		 */
+       virtual void Debug() const = 0;
+
+       /**
+        * Extracts the diagonal elements of an inherited matrix.
+        *
+        * @param[in,out]  d  (prellocated) vector of diagonal elements
+        */
+       virtual void GetDiag(std::vector<double> &d) const = 0;
+
+       /**
+        * Extracts the diagonal elements of the matrix.
+        *
+        * @return  d  vector of diagonal elements
+        */
+       std::vector<double> const & GetDiag() const;
+
+       /**
+        * Performs the matrix-vector product  w := K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     u vector
+        */
+       virtual void Mult(std::vector<double> &w, std::vector<double> const &u) const = 0;
+
+        /**
+        * Calculates the defect/residuum w := f - K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     f load vector
+        * @param[in]     u vector
+        */
+       virtual void Defect(
+                   std::vector<double> &w,
+                   std::vector<double> const &f, std::vector<double> const &u) const = 0;
+
+       //virtual void JacobiSmoother(std::vector<double> const &f, std::vector<double> &u,
+                    //std::vector<double> &r, int nsmooth, double const omega, bool zero) const
+       virtual void JacobiSmoother(std::vector<double> const &, std::vector<double> &,
+                    std::vector<double> &, int, double const, bool) const
+       {
+           std::cout << "ERROR in Matrix::JacobiSmoother" << std::endl;
+           assert(false);
+       }
+
+       /**
+		 * Finds in a CRS matrix the access index for an entry at row @p row and column @p col.
+		 *
+		 * @param[in] row	row index
+		 * @param[in] col	column index
+		 * @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned.
+		 *
+		 * @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist.
+		*/
+       virtual int fetch(int row, int col) const =0;
+
+    protected:
+       int _nrows;              //!< number of rows in matrix
+       int _ncols;              //!< number of columns in matrix
+       mutable std::vector<double> _dd; //!< diagonal matrix elements
+};
+
+// #####################################################################
+class BisectInterpolation;  // class forward declaration
+/**
+ * Matrix in CRS format (compressed row storage; also named CSR),
+ * see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
+ */
+class CRS_Matrix: public Matrix
+{
+    public:
+       /**
+        * Constructor
+        *
+        */
+       CRS_Matrix();
+       
+//! \brief The sparse matrix in CRS format is initialized from a binary file.
+//!
+//!        The binary file has to store 4 Byte integers and 8 Byte doubles and contains the following data:
+//!        - Number of rows
+//!        - Number of non-zero elements/blocks
+//!        - Number of non-zero matrix elements (= previous number * dofs per block)
+//!        - [#elements per row] (counter)
+//!        - [column indices]
+//!        - [matrix elements]
+//!
+//! \param[in]   file name of binary file
+//!
+       explicit CRS_Matrix(const std::string& file);
+
+       CRS_Matrix(CRS_Matrix const&  org)            = default; // Copy constructor
+       CRS_Matrix(CRS_Matrix      && org)            = default; // Move constructor
+       CRS_Matrix& operator=(CRS_Matrix const&  rhs) = default; // Copy assignment
+       CRS_Matrix& operator=(CRS_Matrix      && rhs) = default; // Move assignment
+       ~CRS_Matrix() override;
+
+       /**
+        * Extracts the diagonal elements of the sparse matrix.
+        *
+        * @param[in,out]  d  (prellocated) vector of diagonal elements
+        */
+       void GetDiag(std::vector<double> &d) const override;
+
+       // Solves non-M matrix problems for Jacobi iteration but not for MG
+       void GetDiag_M(std::vector<double> &d) const;
+
+       /**
+        * Performs the matrix-vector product  w := K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     u vector
+        */
+       void Mult(std::vector<double> &w, std::vector<double> const &u) const override;
+
+        /**
+        * Calculates the defect/residuum w := f - K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     f load vector
+        * @param[in]     u vector
+        */
+       void Defect(std::vector<double> &w,
+                   std::vector<double> const &f, std::vector<double> const &u) const override;
+
+       void JacobiSmoother(std::vector<double> const &f, std::vector<double> &u,
+                    std::vector<double> &r, int nsmooth, double omega, bool zero) const override;
+
+       /**
+		 * Show the matrix entries.
+		 */
+       void Debug() const override;
+
+       /**
+		 * Finds in a CRS matrix the access index for an entry at row @p row and column @p col.
+		 *
+		 * @param[in] row	row index
+		 * @param[in] col	column index
+		 * @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned.
+		 *
+		 * @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist.
+		*/
+       int fetch(int row, int col) const override;
+
+        /**
+        * Compare @p this CRS matrix with an external CRS matrix stored in C-Style.
+        *
+        * The method prints statements on differences found.
+        *
+        * @param[in]     nnode  row number of external matrix
+        * @param[in]     id     start indices of matrix rows of external matrix
+        * @param[in]     ik     column indices of external matrix
+        * @param[in]     sk     non-zero values of external matrix
+        *
+        * @return true iff all data are identical.
+        */
+       bool Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const;
+              
+       /**
+		 * Calculates the defect and projects it to the next coarser level @f$ f_C := P^T \cdot (f_F - SK\cdot u_F) @f$.  
+		 *
+		 * @param[in] SK	matrix on fine mesh
+		 * @param[in] P	    prolongation operator
+		 * @param[in,out] fc  resulting coarse mesh vector (preallocated)
+		 * @param[in] ff	r.h.s. on fine mesh
+		 * @param[in] uf	status vector on fine mesh 
+		 *
+		*/
+       friend void DefectRestrict(CRS_Matrix const & SK, BisectInterpolation const& P, 
+       std::vector<double> &fc, std::vector<double> &ff, std::vector<double> &uf);
+       
+//! \brief A sparse matrix in CRS format (counter, column index, value) is written to a binary file.
+//!
+//!        The binary file has to store 4 Byte integers and 8 Byte doubles and contains the following data:
+//!        - Number of rows
+//!        - Number of non-zero elements
+//!        - Number of non-zero elements
+//!        - [#elements per row]
+//!        - [column indices]
+//!        - [elements]
+//!
+//! \param[in]  file name of binary file
+//!
+        void writeBinary(const std::string& file);
+        
+    private:
+//! \brief A sparse matrix in CRS format (counter, column index, value) is read from a binary file.
+//!
+//!        The binary file has to store 4 Byte integers and 8 Byte doubles and contains the following data:
+//!        - Number of rows
+//!        - Number of non-zero elements/blocks
+//!        - Number of non-zero matrix elements (= previous number * dofs per block)
+//!        - [#elements per row]
+//!        - [column indices]
+//!        - [matrix elements]
+//!
+//! \param[in]   file name of binary file
+//!
+        void readBinary(const std::string& file);       
+    
+    public:
+    //private:
+        /**
+        * Checks matrix symmetry.
+        * @return true iff matrix is symmetric.
+        */    
+       bool CheckSymmetry() const;
+       
+        /**
+        * Checks whether the sum of all entries in each separate row is zero.
+        * @return true/false
+        */        
+       bool CheckRowSum() const;
+       
+        /**
+        * Checks M-matrix properties.
+        * @return true/false
+        */         
+       bool CheckMproperty() const;
+       
+        /**
+        * Checks for several matrix properties, as row sum and M-matrix
+        * @return true/false
+        */   
+       bool CheckMatrix() const;
+ 
+        /**
+        * Changes the given matrix into an M-matrix.
+        * The posive off diagonal enries of a row are lumped (added) 
+        * to the main diagonal entry of that row.
+        * 
+        * @return true iff changes have been neccessary.
+        */         
+       bool ForceMproperty();    
+
+    protected:
+       int _nnz;                //!< number of non-zero entries
+       std::vector<int> _id;    //!< start indices of matrix rows
+       std::vector<int> _ik;    //!< column indices
+       std::vector<double> _sk; //!< non-zero values
+};
+
+
+/**
+ * FEM Matrix in CRS format (compressed row storage; also named CSR),
+ * see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
+ */
+class FEM_Matrix: public CRS_Matrix
+{
+    public:
+       /**
+        * Initializes the CRS matrix structure from the given discretization in @p mesh.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+        *
+        * @param[in] mesh given discretization
+        *
+        * @warning A reference to the discretization @p mesh is stored inside this class.
+        *          Therefore, changing @p mesh outside requires also
+        *          to call method @p Derive_Matrix_Pattern explicitly.
+        *
+        * @see Derive_Matrix_Pattern
+        */
+       explicit FEM_Matrix(Mesh const & mesh);
+
+       FEM_Matrix(FEM_Matrix const &) = default;
+
+      /**
+        * Destructor.
+        */
+       ~FEM_Matrix() override;
+
+       /**
+        * Generates the sparse matrix pattern and overwrites the existing pattern.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+       */
+       void Derive_Matrix_Pattern()
+       {
+           //Derive_Matrix_Pattern_slow();
+           Derive_Matrix_Pattern_fast();
+           CheckRowSum();
+       }
+       void Derive_Matrix_Pattern_fast();
+       void Derive_Matrix_Pattern_slow();
+
+
+        /**
+        * Calculates the entries of f.e. stiffness matrix for the Laplace operator
+        * and load/rhs vector @p f.
+        * No memory is allocated.
+        *
+        * @param[in,out] f (preallocated) rhs/load vector
+        */
+       void CalculateLaplace(std::vector<double> &f);
+
+        ///**
+        //* Calculates the entries of f.e. stiffness matrix for the Laplace operator
+        //* and load/rhs vector @p f according to function @p f_func,
+        //*
+        //* @param[in,out] f (preallocated) rhs/load vector
+        //* @param[in]     f_func function f(x,y,z)
+        //*/
+       //void CalculateLaplace(std::vector<double> &f, const std::function<double(double, double, double)> &f_func);       
+
+        /**
+        * Calculates the entries of load/rhs vector @p f.
+        * No memory is allocated.
+        *
+        * @param[in,out] f    (preallocated) rhs/load vector
+        * @param[in]     func continuous function f(x,y)
+        */
+       void CalculateRHS(std::vector<double> &f, const std::function<double(double,double)> &func);
+
+       /**
+        * Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f.
+        * The <a href="https://www.jstor.org/stable/2005611?seq=1#metadata_info_tab_contents">penalty method</a>
+        * is used for incorporating the given values @p u.
+        *
+        * @param[in]     u (global) vector with Dirichlet data
+        * @param[in,out] f load vector
+        */
+       void ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f);
+
+       /**
+        * Extracts the diagonal elements of the sparse matrix.
+        *
+        * @param[in,out]  d  (prellocated) vector of diagonal elements
+       */
+       //void GetDiag(std::vector<double> &d) const;   // override in MPI parallel
+       void GetDiag(std::vector<double> &d) const override  { GetDiag_M(d); }
+       
+       // Solves non-M matrix problems for Jacobi iteration but not for MG
+       //void GetDiag_M(std::vector<double> &d) const;
+
+
+      /**
+        * Adds the element stiffness matrix @p ske and the element load vector @p fe
+        * of one triangular element with linear shape functions to the appropriate positions in
+        * the stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik).
+        *
+        * @param[in]     ial   node indices of the three element vertices
+        * @param[in]     ske   element stiffness matrix
+        * @param[in]     fe    element load vector
+        * @param[in,out] f	   distributed local vector storing the right hand side
+        *
+        * @warning Algorithm assumes linear triangular elements (ndof_e==3).
+       */
+       void AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], std::vector<double> &f);
+
+      /**
+        * Adds the the element load vector @p fe
+        * of one triangular element with linear shape functions to the appropriate positions in
+        * the right hand side @p f.
+        *
+        * @param[in]     ial   node indices of the three element vertices
+        * @param[in]     fe    element load vector
+        * @param[in,out] f	   distributed local vector storing the right hand side
+        *
+        * @warning Algorithm assumes linear triangular elements (ndof_e==3).
+       */
+       void AddElemRHS_3(int const ial[3], double const fe[3], std::vector<double> &f);
+
+
+    private:
+       Mesh const & _mesh;      //!< reference to discretization
+
+};
+
+
+///**
+ //* Prolongation matrix in CRS format (compressed row storage; also named CSR),
+ //* see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
+ //*
+ //* The prolongation is applied for each node from the coarse mesh to the fine mesh and
+ //* is derived only geometrically (no operator weighted prolongation).
+ //*/
+//class Prolongation: public CRS_Matrix
+//{
+    //public:
+       ///**
+        //* Intializes the CRS matrix structure from the given discetization in @p mesh.
+        //*
+        //* The sparse matrix pattern is generated but the values are 0.
+        //*
+        //* @param[in] cmesh coarse mesh
+        //* @param[in] fmesh fine mesh
+        //*
+        //* @warning A reference to the discretizations @p fmesh  @p cmesh are stored inside this class.
+        //*          Therefore, changing these meshes outside requires also
+        //*          to call method @p Derive_Matrix_Pattern explicitely.
+        //*
+        //* @see Derive_Matrix_Pattern
+        //*/
+       //Prolongation(Mesh const & cmesh, Mesh const & fmesh);
+
+       ///**
+        //* Destructor.
+        //*/
+       //~Prolongation() override
+       //{}
+
+       ///**
+        //* Generates the sparse matrix pattern and overwrites the existing pattern.
+        //*
+        //* The sparse matrix pattern is generated but the values are 0.
+       //*/
+       //void Derive_Matrix_Pattern() override;
+
+    //private:
+       //Mesh const & _cmesh;      //!< reference to coarse discretization
+       //Mesh const & _fmesh;      //!< reference to fine discretization
+//};
+
+// *********************************************************************
+
+
+/**
+ * Interpolation matrix for prolongation coarse mesh (C) to a fine mesh (F)
+ * generated by bisecting edges.
+ *
+ * All interpolation weights are 0.5 (injection points contribute twice).
+ */
+class BisectInterpolation: public Matrix
+{
+    public:
+       /**
+        * Generates the interpolation matrix for prolongation coarse mesh to a fine mesh
+        * generated by bisecting edges.
+        * The interpolation weights are all 0.5.
+        *
+        * @param[in] fathers vector[nnodes][2] containing
+        *                    the two coarse grid fathers of a fine grid vertex
+        *
+        */
+       explicit BisectInterpolation(std::vector<int> const & fathers);
+       BisectInterpolation();
+
+       BisectInterpolation(BisectInterpolation const&  org)            = default; // Copy constructor
+       BisectInterpolation(BisectInterpolation      && org)            = default; // Move constructor
+       BisectInterpolation& operator=(BisectInterpolation const&  rhs) = default; // Copy assignment
+       BisectInterpolation& operator=(BisectInterpolation      && rhs) = default; // Move assignment
+       ~BisectInterpolation() override;
+
+       /**
+        * Extracts the diagonal elements of the matrix.
+        *
+        * @param[in,out]  d  (prellocated) vector of diagonal elements
+        */
+       void GetDiag(std::vector<double> &d) const override;
+       ///**
+        //* Extracts the diagonal elements of the sparse matrix.
+        //*
+        //* @return  d  vector of diagonal elements
+        //*/
+       //std::vector<double> const & GetDiag() const override;
+
+       /**
+        * Performs the prolongation  @f$ w_F := P*u_C @f$.
+        *
+        * @param[in,out] wf resulting fine vector (preallocated)
+        * @param[in]     uc coarse vector
+        */
+       void Mult(std::vector<double> &wf, std::vector<double> const &uc) const override;
+
+       /**
+        * Performs the restriction  @f$ u_C := P^T*w_F @f$.
+        *
+        * @param[in]         wf fine vector
+        * @param[in,out]     uc resulting coarse vector (preallocated)
+        */
+       virtual void MultT(std::vector<double> const &wf, std::vector<double> &uc) const;
+       
+        /**
+        * Performs the full restriction  @f$ u_C := F^{-1}*P^T*w_F @f$.
+        * 
+        * @f$ F @f$ denotes the row sum of the restriction matrix 
+        * and results in restricting exactly a bilinear function from the fine grid onto 
+        * the same bilinear function on the coarse grid.
+        *
+        * @param[in]         wf fine vector
+        * @param[in,out]     uc resulting coarse vector (preallocated)
+        */
+       void MultT_Full(std::vector<double> const &wf, std::vector<double> &uc) const;
+
+
+        /**
+        * Calculates the defect/residuum w := f - P*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     f load vector
+        * @param[in]     u coarse vector
+        */
+       void Defect(std::vector<double> &w,
+                   std::vector<double> const &f, std::vector<double> const &u) const override;
+
+       /**
+		 * Show the matrix entries.
+		 */
+       void Debug() const override;
+
+       /**
+		 * Finds in this matrix the access index for an entry at row @p row and column @p col.
+		 *
+		 * @param[in] row	row index
+		 * @param[in] col	column index
+		 * @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned.
+		 *
+		 * @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist.
+		*/
+       int fetch(int row, int col) const override;
+
+       /**
+		 * Calculates the defect and projects it to the next coarser level @f$ f_C := P^T \cdot (f_F - SK\cdot u_F) @f$.  
+		 *
+		 * @param[in] SK	matrix on fine mesh
+		 * @param[in] P	    prolongation operator
+		 * @param[in,out] fc  resulting coarse mesh vector (preallocated)
+		 * @param[in] ff	r.h.s. on fine mesh
+		 * @param[in] uf	status vector on fine mesh 
+		 *
+		*/       
+       friend void DefectRestrict(CRS_Matrix const & SK, BisectInterpolation const& P, 
+       std::vector<double> &fc, std::vector<double> &ff, std::vector<double> &uf);
+
+    protected:
+       std::vector<int> _iv;     //!< fathers[nnode][2] of fine grid nodes, double entries denote injection points
+       std::vector<double> _vv;  //!< weights[nnode][2] of fathers for grid nodes
+};
+
+/**
+ * Interpolation matrix for prolongation from coarse mesh (C)) to a fine mesh (F)
+ * generated by bisecting edges.
+ * 
+ * We take into account that values at Dirichlet nodes have to be preserved, i.e.,
+ * @f$ w_F = P \cdot I_D \cdot w_C @f$ and @f$ d_C = I_D  \cdot P^T \cdot  d_F@f$
+ * with @f$ I_D @f$ as @f$ n_C \times n_C @f$ diagonal matrix and entries
+ * @f$ I_{D(j,j)} := \left\{{\begin{array}{l@{\qquad}l} 0 & x_{j}\;\;  \textrm{is Dirichlet node} \\ 1 & \textrm{else} \end{array}}\right. @f$
+ *
+ * Interpolation weights are eighter 0.5 or 0.0 in case of coarse Dirichlet nodes
+ * (injection points contribute twice),
+ * Sets weight to zero iff (at least) one father nodes is a Dirichlet node.
+ */
+class BisectIntDirichlet: public BisectInterpolation
+{
+    public:
+       /**
+		 * Default constructor.
+		*/
+       BisectIntDirichlet()
+        : BisectInterpolation(), _idxDir()
+       {}
+
+       /**
+		 * Constructs interpolation from father-@p row and column @p col.
+		 *
+		 * @param[in] fathers	two father nodes from each fine node [nnode_f*2].
+		 * @param[in] idxc_dir	vector containing the indices of coarse mesh Dirichlet nodes.
+		 *
+		*/
+       BisectIntDirichlet(std::vector<int> const & fathers, std::vector<int> idxc_dir);
+
+       BisectIntDirichlet(BisectIntDirichlet const&  org)            = default; // Copy constructor
+       BisectIntDirichlet(BisectIntDirichlet      && org)            = default; // Move constructor
+       BisectIntDirichlet& operator=(BisectIntDirichlet const&  rhs) = delete; // Copy assignment
+       BisectIntDirichlet& operator=(BisectIntDirichlet      && rhs) = delete;  // Move assignment
+       ~BisectIntDirichlet() override;       
+       
+      /**
+        * Performs the restriction  @f$ u_C := P^T*w_F @f$.
+        *
+        * @param[in]         wf fine vector
+        * @param[in,out]     uc resulting coarse vector (preallocated)
+        */
+       void MultT(std::vector<double> const &wf, std::vector<double> &uc) const override;
+       
+    private:
+       std::vector<int> const _idxDir;
+};
+
+
+
+// *********************************************************************
+
+/**
+ * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+ * of one triangular element with linear shape functions.
+ * @param[in]	ial	node indices of the three element vertices
+ * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+ * @param[out] ske	element stiffness matrix
+ * @param[out] fe	element load vector
+ */
+void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3]);
+
+/**
+ * Calculates the element mass matrix @p ske.
+ * of one triangular element with linear shape functions.
+ * @param[in]	ial	node indices of the three element vertices
+ * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+ * @param[out]  ske element stiffness matrix
+ */
+void CalcElem_Masse(int const ial[3], double const xc[], double ske[3][3]);
+
+/**
+ * Calculates element load vector @p fe of one triangular element with linear shape functions.
+ * @param[in]	ial	 node indices of the three element vertices
+ * @param[in]	xc	 vector of node coordinates with x(2*k,2*k+1) as coordinates of node k
+ * @param[out]  fe   element load vector
+ * @param[in]   func continuous function f(x,y)
+*/
+void CalcElem_RHS(int const ial[3], double const xc[], double fe[3],
+             const std::function<double(double,double)> &func);
+
+/**
+ * Adds the element stiffness matrix @p ske and the element load vector @p fe
+ * of one triangular element with linear shape functions to the appropriate positions in
+ * the symmetric stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik)
+ *
+ * @param[in] ial   node indices of the three element vertices
+ * @param[in] ske	element stiffness matrix
+ * @param[in] fe	element load vector
+ * @param[out] sk	vector non-zero entries of CSR matrix
+ * @param[in] id	index vector containing the first entry in a CSR row
+ * @param[in] ik	column index vector of CSR matrix
+ * @param[out] f	distributed local vector storing the right hand side
+ *
+ * @warning Algorithm requires indices in connectivity @p ial in ascending order.
+ *          Currently deprecated.
+*/
+void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
+             int const id[], int const ik[], double sk[], double f[]);
+
+
--- a/mgrid_2/jacsolve.cpp
+++ b/mgrid_2/jacsolve.cpp
@ -0,0 +1,331 @@
+#include "vdop.h"
+#include "geom.h"
+#include "getmatrix.h"
+#include "jacsolve.h"
+#include "userset.h"
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+// #####################################################################
+// 	const int neigh[], const int color,
+// 	const MPI::Intracomm& icomm,
+void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
+{
+    const double omega   = 1.0;
+    //const int    maxiter = 1000;
+    const int    maxiter = 240; // GH
+    const double tol  = 1e-6;                // tolerance
+    const double tol2 = tol * tol;           // tolerance^2
+
+    int nrows = SK.Nrows();                  // number of rows == number of columns
+    assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
+
+    cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s"  << endl;
+    //  Choose initial guess
+    for (int k = 0; k < nrows; ++k) {
+        u[k] = 0.0;                          //  u := 0
+    }
+
+    vector<double> dd(nrows);                // matrix diagonal
+    vector<double>  r(nrows);                // residual
+    vector<double>  w(nrows);                // correction
+
+    SK.GetDiag(dd);                          //  dd := diag(K)
+    ////DebugVector(dd);{int ijk; cin >> ijk;}
+
+    //  Initial sweep
+    SK.Defect(r, f, u);                      //  r := f - K*u
+
+    vddiv(w, r, dd);                         //  w := D^{-1}*r
+    const double sigma0 = dscapr(w, r);      // s0 := <w,r>
+
+    // Iteration sweeps
+    int iter  = 0;
+    double sigma = sigma0;
+    while ( sigma > tol2 * sigma0 && maxiter > iter)  // relative error
+        //while ( sigma > tol2 && maxiter > iter)         // absolute error
+    {
+        ++iter;
+        vdaxpy(u, u, omega, w );             //  u := u + om*w
+        SK.Defect(r, f, u);                  //  r := f - K*u
+        vddiv(w, r, dd);                     //  w := D^{-1}*r
+        double sig_old=sigma;
+        sigma = dscapr(w, r);                // s0 := <w,r>
+      	//cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
+        if (sigma>sig_old) cout << "Divergent at iter " << iter << endl;    // GH
+    }
+    cout << "aver. Jacobi rate :  " << exp(log(sqrt(sigma / sigma0)) / iter) << "  (" << iter << " iter)" << endl;
+    cout << "final error: " << sqrt(sigma / sigma0) << " (rel)   " << sqrt(sigma) << " (abs)\n";
+
+    return;
+}
+
+
+
+void JacobiSmoother(Matrix const &SK, std::vector<double> const &f, std::vector<double> &u,
+                    std::vector<double> &r, int nsmooth, double const omega, bool zero)
+{
+    //// ToDO: ensure compatible dimensions
+    SK.JacobiSmoother(f, u, r, nsmooth, omega, zero);
+    return;
+
+    //int const nnodes = static_cast<int>(u.size());
+    //if (zero) {            // assumes initial solution is zero
+        //DiagPrecond(SK, f, u, omega);
+        //--nsmooth;                           // first smoothing sweep done
+    //}
+    ////cout << zero << endl;
+
+    //auto const &D = SK.GetDiag();            // accumulated diagonal of matrix @p SK.
+    ////auto const D = SK.GetDiag();            // accumulated diagonal of matrix @p SK.
+    //for (int ns = 1; ns <= nsmooth; ++ns) {
+        //SK.Defect(r, f, u);                  //  r := f - K*u
+//#pragma omp parallel for    
+        //for (int k = 0; k < nnodes; ++k) {
+            //// u := u + om*D^{-1}*r
+            //u[k] = u[k] + omega * r[k] / D[k]; // MPI: distributed to accumulated vector needed
+        //}
+    //}
+
+    //return;
+}
+
+void DiagPrecond(Matrix const &SK, std::vector<double> const &r, std::vector<double> &w,
+                 double const omega)
+{
+    // ToDO: ensure compatible dimensions
+    auto const &D = SK.GetDiag();        // accumulated diagonal of matrix @p SK.
+    int const nnodes = static_cast<int>(w.size());
+//#pragma omp parallel for    
+#pragma omp for    
+    for (int k = 0; k < nnodes; ++k) {
+        w[k] = omega * r[k] / D[k];      // MPI: distributed to accumulated vector needed
+    }
+
+    return;
+}
+
+
+Multigrid::Multigrid(Mesh const &cmesh, int const nlevel)
+    : _meshes(cmesh, nlevel),
+      _vSK(), _u(_meshes.size()), _f(_meshes.size()), _d(_meshes.size()), _w(_meshes.size()),
+      _vPc2f()
+{
+    cout << "\n........................  in Multigrid::Multigrid  ..................\n";
+    // Allocate Memory for matrices/vectors on all levels
+    for (size_t lev = 0; lev < Nlevels(); ++lev) {
+        _vSK.emplace_back(_meshes[lev] );  // CRS matrix
+        const auto nn = _vSK[lev].Nrows();
+        _u[lev].resize(nn);
+        _f[lev].resize(nn);
+        _d[lev].resize(nn);
+        _w[lev].resize(nn);
+        auto vv = _meshes[lev].GetFathersOfVertices();
+        cout << vv.size() << endl;
+    }
+    // Intergrid transfer operators
+    //cout << "\n........................  in Multigrid::Multigrid  Prolongation ..................\n";
+    //_vPc2f.push_back( BisectInterpolation(vector<int>(0)) ); // no prolongation to the coarsest grid
+    _vPc2f.emplace_back( ); // no prolongation to the coarsest grid
+    for (size_t lev = 1; lev < Nlevels(); ++lev) {
+                    //cout << lev << endl;
+                    //cout << _meshes[lev].GetFathersOfVertices () << endl;
+        //_vPc2f.push_back( BisectInterpolation( _meshes[lev].GetFathersOfVertices () )  );
+        _vPc2f.emplace_back( _meshes[lev].GetFathersOfVertices (), _meshes[lev-1].Index_DirichletNodes ()   );
+                    //cout << _vPc2f.back().Nrows() << "  " << _vPc2f.back().Ncols() << endl;
+        //checkInterpolation(lev);
+        //checkRestriction(lev);
+    }
+    cout << "\n..........................................\n";
+}
+
+Multigrid::~Multigrid()
+{}
+
+void Multigrid::DefineOperators()
+{
+    for (size_t lev = 0; lev < Nlevels(); ++lev) {
+        DefineOperator(lev);
+    }
+    return;
+}
+
+#include "omp.h"
+void Multigrid::DefineOperator(size_t lev)
+{
+    double tstart = omp_get_wtime(); 
+    _vSK[lev].CalculateLaplace(_f[lev]);  // fNice()  in userset.h
+    double t1 = omp_get_wtime() - tstart;             // OpenMP
+    cout << "CalculateLaplace: timing in sec. : " << t1 << "   in level " << lev << endl;
+
+    if (lev == Nlevels() - 1) {                // fine mesh
+        _meshes[lev].SetValues(_u[lev], [](double x, double y) -> double
+        { return x *x * std::sin(2.5 * M_PI * y); }
+                              );
+    }
+    else {
+        _meshes[lev].SetValues(_u[lev], f_zero);
+    }
+
+//// GH:
+    ////_vSK[lev].CheckRowSum();
+    ////if (!_vSK[lev].CheckMproperty()) _vSK[lev].ForceMproperty();
+    
+    //assert(_vSK[lev].CheckSymmetry());
+    //assert(_vSK[lev].CheckRowSum());
+    ////assert(_vSK[lev].CheckMproperty());
+    //_vSK[lev].CheckMproperty();
+//// HG
+    
+    _vSK[lev].ApplyDirichletBC(_u[lev], _f[lev]);
+
+    return;
+}
+
+void Multigrid::JacobiSolve(size_t lev)
+{
+    assert(lev < Nlevels());
+    ::JacobiSolve(_vSK[lev], _f[lev], _u[lev]);
+}
+
+void Multigrid::MG_Step(size_t lev, int const pre_smooth, bool const bzero, int nu)
+{
+    assert(lev < Nlevels());
+    int const post_smooth = pre_smooth;
+
+    if (lev == 0) { // coarse level
+        // GH: a factorization (once in setup) with repeated forward-backward substitution would be better
+        int n_jacobi_iterations = GetMesh(lev).Nnodes()/10; // ensure accuracy for coarse grid silver
+        //#pragma omp parallel
+        JacobiSmoother(_vSK[lev], _f[lev], _u[lev], _d[lev],  n_jacobi_iterations, 1.0, true);
+        //JacobiSmoother(_vSK[lev], _f[lev], _u[lev], _d[lev],  1000, 1.0, false);
+    }
+    else {
+        //#pragma omp parallel
+        JacobiSmoother(_vSK[lev], _f[lev], _u[lev], _d[lev],  pre_smooth, 0.85, bzero || lev < Nlevels()-1);
+        //JacobiSmoother(_vSK[lev], _f[lev], _u[lev], _d[lev],  pre_smooth, 0.85, bzero);
+
+        if (nu > 0) {
+            //#pragma omp single
+            //cout << "AA\n";
+        //#pragma omp parallel
+            _vSK[lev].Defect(_d[lev], _f[lev], _u[lev]);   //   d := f - K*u
+            //#pragma omp single
+            //cout << "BB\n";            
+        //#pragma omp parallel
+            _vPc2f[lev].MultT(_d[lev], _f[lev - 1]);       // f_H := R*d
+            // faster than Defect+MultT, slightly different final error (80 bit register for wi ?)
+            //DefectRestrict(_vSK[lev], _vPc2f[lev], _f[lev - 1], _f[lev], _u[lev]); // f_H := R*(f - K*u)
+            
+            //cout << "f fine  " << endl; GetMesh(lev).Visualize(_f[lev]);
+            //cout << "u fine  " << endl; GetMesh(lev).Visualize(_u[lev]);
+            //cout << "d fine  " << endl; GetMesh(lev).Visualize(_d[lev]);
+            //cout << "f coarse" << endl; GetMesh(lev-1).Visualize(_f[lev - 1]);
+
+                    //_meshes[lev-1].Visualize(_f[lev - 1]);        // GH: Visualize: f_H should be 0 on Dirichlet B.C.
+            //#pragma omp single
+            //cout << "CC\n";
+            MG_Step(lev - 1, pre_smooth, true, nu);       // solve  K_H * u_H =f_H  with u_H:=0
+            for (int k = 1; k < nu; ++k) {
+                // W-cycle
+                MG_Step(lev - 1, pre_smooth, false, nu);  // solve  K_H * u_H =f_H
+            }
+            //#pragma omp single
+            //cout << "DD\n";
+        //#pragma omp parallel
+            _vPc2f[lev].Mult(_w[lev], _u[lev - 1]);        // w := P*u_H
+
+        //#pragma omp parallel
+            vdaxpy(_u[lev], _u[lev], 1.0, _w[lev] );      // u := u + tau*w
+        }
+
+        //#pragma omp parallel
+        JacobiSmoother(_vSK[lev], _f[lev], _u[lev], _d[lev],  post_smooth, 0.85, false);
+    }
+    return;
+}
+
+void Multigrid::MG_Solve(int pre_smooth, double eps, int nu)
+{
+    size_t lev=Nlevels()-1;                // fine level
+
+    double s0(-1);
+    double si(0);
+    #pragma omp parallel shared(s0,si)
+    {
+    // start with zero guess
+    DiagPrecond(_vSK[lev], _f[lev], _u[lev], 1.0);  // w   := D^{-1]*f
+    //_u[lev] = _w[lev];
+   
+    //double s0(-1), si(0);
+    //#pragma omp parallel shared(s0,si)
+    //{
+    //double s0 = L2_scapr(_f[lev],_w[lev]);         // s_0 := <f,w>
+    //double s0 = dscapr(_f[lev],_w[lev]);         // s_0 := <f,w>
+    //#pragma omp parallel
+    //dscapr(_f[lev],_w[lev],s0);         // s_0 := <f,w>
+    dscapr(_f[lev],_u[lev],s0);         // s_0 := <f,u>
+    //s0 =  dscapr(_f[lev],_w[lev]);
+    //double si;
+
+    bool bzero = true;                       // start with zero guess
+    int  iter  = 0;
+    do
+    {
+        //#pragma omp parallel
+        MG_Step(lev, pre_smooth, bzero, nu);
+        bzero=false;
+        //#pragma omp parallel
+        _vSK[lev].Defect(_d[lev], _f[lev], _u[lev]);    //   d := f - K*u
+        //#pragma omp parallel
+        DiagPrecond(_vSK[lev], _d[lev], _w[lev], 1.0);  // w   := D^{-1]*d
+        //si = L2_scapr(_d[lev],_w[lev]);                // s_i := <d,w>
+        //si = dscapr(_d[lev],_w[lev]);                // s_i := <d,w>
+        //#pragma omp parallel
+        dscapr(_d[lev],_w[lev], si);                // s_i := <d,w>
+        //si = dscapr(_d[lev],_w[lev]);
+        ++iter;
+    } while (si>s0*eps*eps && iter<1000);
+
+    #pragma omp single
+    cout << "\nrel. error: " << sqrt(si/s0) << "  ( " << iter << " iter.)" << endl;
+    }
+
+    return;
+}
+
+[[maybe_unused]] bool Multigrid::checkInterpolation(size_t const lev)
+{
+    assert(1<=lev && lev<Nlevels());
+    _meshes[lev-1].SetValues(_w[lev-1], [](double x, double y) -> double
+                           { return x+y; }  );
+    _meshes[lev].SetValues(_w[lev], [](double /*x*/, double /*y*/) -> double
+                           { return -123.0; }  );
+    //static_cast<BisectInterpolation>(_vPc2f[lev]).Mult(_d[lev], _w[lev - 1]);        // d := P*w_H
+    _vPc2f[lev].Mult(_d[lev], _w[lev - 1]);        // d := P*w_H
+    
+    cout << "înterpolated  " << endl; GetMesh(lev).Visualize(_d[lev]);
+    
+    return true;
+}
+
+[[maybe_unused]] bool Multigrid::checkRestriction(size_t const lev)
+{
+    assert(1<=lev && lev<Nlevels());
+    _meshes[lev].SetValues(_d[lev], [](double x, double y) 
+                           { return x+y; }  );
+    _meshes[lev-1].SetValues(_w[lev-1], [](double /*x*/, double /*y*/) -> double
+                           { return -123.0; }  );
+    //static_cast<BisectInterpolation>(_vPc2f[lev]).MultT(_d[lev], _w[lev - 1]);        // w_H := R*d
+    _vPc2f[lev].MultT(_d[lev], _w[lev - 1]);        // w_H := R*d
+    
+    cout << "restricted  " << endl; GetMesh(lev-1).Visualize(_w[lev-1]);
+    
+    return true;
+}
+
+
--- a/mgrid_2/jacsolve.h
+++ b/mgrid_2/jacsolve.h
@ -0,0 +1,151 @@
+#pragma once
+#include "geom.h"
+#include "getmatrix.h"
+#include <vector>
+
+
+/**
+ * Solves linear system of equations  K @p u = @p f  via the Jacobi iteration.
+ * We use a distributed symmetric  CSR matrix @p SK and initial guess of the
+ * solution is set to 0.
+ * @param[in] SK	CSR matrix
+ * @param[in] f		distributed local vector storing the right hand side
+ * @param[out] u	accumulated local vector storing the solution.
+*/
+void JacobiSolve(CRS_Matrix const &SK, std::vector<double> const &f, std::vector<double> &u);
+
+
+/**
+ * Solves linear system of equations  K @p u = @p f  via the Jacobi iteration.
+ * We use a distributed symmetric  CSR matrix @p SK and initial guess of the
+ * solution is set to 0.
+ *
+ * In each smoothing step: @f$ \widehat{u} := u + \omega D^{-1}\left({f-K*u}\right) @f$
+ *
+ * @param[in]  SK	CSR matrix
+ * @param[in]  f	distributed local vector storing the right hand side
+ * @param[out] u	accumulated local vector storing the solution.
+ * @param[in,out] r	auxiliary local vector.
+ * @param[in] nsmooth	number of smoothing steps.
+ * @param[in] omega	relaxation parameter.
+ * @param[in] zero	initial solution @p u is assumed to be zero.
+*/
+void JacobiSmoother(Matrix const &SK, std::vector<double> const &f, std::vector<double> &u,
+                    std::vector<double> & r, int nsmooth=1, double omega=1.0, bool zero=false);
+
+/**
+ * @brief Simple diagonale preconditioning.
+ *
+ * The residuum @p r scaled by the inverse diagonal of matríx @p SK results in the correction @p w.
+ *
+ * @f$ w :=  \omega D^{-1}*r @f$
+ *
+ * @param[in]  SK	matrix
+ * @param[in]  r	distributed local vector storing the residuum
+ * @param[out] w	accumulated local vector storing the correction.
+ * @param[in] omega	relaxation parameter.
+*/
+void DiagPrecond(Matrix const &SK, std::vector<double> const &r, std::vector<double> &w,
+                 double omega=1.0);
+
+
+
+/**
+ * @brief The Multigrid hierarchy including meshes, vectors and matrices, prolongations is stored.
+*/
+class Multigrid
+{
+    public:
+       /**
+        * Generates the mesh hierachy with @p nlevel meshes starting from coarse mesh @p cmesg .
+        *
+        * The refined meshes are generated by edge bisection.
+        * All memory is allocated but stiffness matrices are yet not calculated
+	    *
+        * @param[in] cmesh	  initial coarse mesh
+        * @param[in] nlevel  number of meshes in hierarchy, including the initial coarse mesh
+        *
+		*/
+       Multigrid(Mesh const& cmesh, int nlevel);
+
+       Multigrid(Multigrid const&)            = delete;
+       Multigrid& operator=(Multigrid const&) = delete;
+
+       ~Multigrid();
+
+       /**
+		 * @return  Number of meshes in hierarchy.
+		 */
+       [[nodiscard]] size_t Nlevels() const
+       {return _meshes.size(); }
+
+       /**
+		 * @return  Number of Unknowns.
+		 */
+       [[nodiscard]] int Ndofs() const
+       {return _meshes[Nlevels()-1].Nnodes(); }
+
+       /**
+		 * @return  Meshes number @p lev .
+		*/
+       [[nodiscard]] Mesh const& GetMesh(int lev) const
+       { return _meshes[lev]; }
+
+       /**
+		 * @return  Solution vector at level @p lev .
+		 */
+       [[nodiscard]] std::vector<double> const&  GetSolution(int lev) const
+       { return _u.at(lev); }
+
+       /**
+		 * Calculates PDE matrices for all levels.
+		 */
+       void DefineOperators();
+
+       /**
+        * Calculates PDE matrix for level @p lev.
+        *
+        * @param[in] lev  level in hierachy
+		*/
+       void DefineOperator(size_t lev);
+
+       /**
+        * Solves the system of equations at level @p lev via Jacobi iterations
+        *
+        * @param[in] lev  level in hierachy
+		*/
+       void JacobiSolve(size_t lev);
+
+      /**
+       * Peforms one multigrid step at level @p lev .
+       *
+       * @param[in] lev         level in hierachy
+       * @param[in] pre_smooth  number of pre/post-smoothing sweeps
+       * @param[in] bzero       start with zero-vector as solution
+       * @param[in] nu          defines the multigrid cycle (1/2 = V/W)
+       */
+       void MG_Step(size_t lev, int pre_smooth=1, bool bzero=false, int nu=1);
+
+      /**
+       * Solves the system of equations at finest level via multigrid
+         *
+         * @param[in] pre_smooth  number of pre/post-smoothing sweeps
+         * @param[in] eps         stopping criteria (relative error)
+         * @param[in] nu          defines the multigrid cycle (1/2 = V/W)
+		 */
+       void MG_Solve(int pre_smooth=1, double eps=1e-6, int nu=1);
+
+    [[maybe_unused]] bool checkInterpolation(size_t lev);
+
+    [[maybe_unused]] bool checkRestriction(size_t lev);
+
+
+    private:
+       gMesh_Hierarchy _meshes;                 //!< mesh hierarchy from coarse (level 0) to fine.
+       std::vector<FEM_Matrix>           _vSK;  //!< Sparse matrix on each level.
+       std::vector<std::vector<double>>   _u;   //!< Solution vector on each level.
+       std::vector<std::vector<double>>   _f;   //!< Right hand side vector on each level.
+       std::vector<std::vector<double>>   _d;   //!< Defect vector on each level.
+       std::vector<std::vector<double>>   _w;   //!< Correction vector on each level.
+       std::vector<BisectIntDirichlet> _vPc2f;  //!< Interpolation to level from next coarser level.
+};
--- a/mgrid_2/main.cpp
+++ b/mgrid_2/main.cpp
@ -0,0 +1,141 @@
+#include "geom.h"
+#include "getmatrix.h"
+#include "jacsolve.h"
+#include "userset.h"
+#include "vdop.h"
+
+#include <cassert>
+#include <chrono>           // timing
+#include <cmath>
+#include <iostream>
+#include <omp.h>
+using namespace std;
+using namespace std::chrono;  // timing
+
+
+void Test_solver(int lev, vector<int> const &nthreads, Multigrid &ggm);
+
+int main(int argc, char **argv )
+{
+#undef MG
+#ifndef MG
+    // Jacobi iteration
+    int nrefine = 0;
+    if (argc > 1)  nrefine = atoi(argv[1]);
+
+    //Mesh const mesh_c("square_tiny.txt");
+    Mesh const mesh_c("square_100.txt");
+    //Mesh const mesh_c("square.txt");
+    bool ba = mesh_c.checkObtuseAngles();
+    if (ba) cout << "mesh corrected" << endl;
+
+    gMesh_Hierarchy ggm(mesh_c, nrefine);
+    const Mesh &mesh = ggm.finest();
+    //mesh.Debug();
+    //mesh.DebugEdgeBased();
+
+    FEM_Matrix SK(mesh);                   // CRS matrix
+    //SK.writeBinary("sparseMatrix.bin");
+    //SK.Debug();
+
+    vector<double> uv(SK.Nrows(), 0.0);    // temperature
+    vector<double> fv(SK.Nrows(), 0.0);    // r.h.s.
+
+    SK.CalculateLaplace(fv);                       // matrix
+    SK.CalculateRHS(fv, [](double x, double y) {   // rhs
+        return std::sin(M_PI * 2.5 * y) * (M_PI * M_PI * 2.5 * 2.5 * x * x - 2);
+    }
+                   );
+    //SK.CheckRowSum();
+    SK.CheckMatrix();
+    //SK.Debug();
+
+    // Two ways to initialize the vector
+    //mesh.SetValues(uv,f_zero);             // user function
+    //mesh.SetValues(uv, [](double x, double y) -> double {return 0.0*x*y;} );  // lambda function
+    //mesh.SetValues(uv, [](double x, double y) -> double {return 5e-3*(x+1)*(y+1);} );  // lambda function
+    //
+    mesh.SetValues(uv, [](double x, double y) -> double {
+        return x *x * std::sin(2.5 * M_PI * y);
+    } );
+
+    SK.ApplyDirichletBC(uv, fv);
+
+    auto exact_sol(uv);
+    //SK.Mult(fv,uv);
+
+    auto t3 = system_clock::now(); // start timer
+
+    JacobiSolve(SK, fv, uv );          // solve the system of equations
+
+    auto t4 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t4 - t3);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    cout << "JacobiSolve: timing in sec. : " << t_diff << endl;
+
+    auto [val, idx] = findLargestAbsError(exact_sol, uv, 1e+6, 100);
+
+    //mesh.Visualize(getAbsError(exact_sol, uv));
+    mesh.Visualize(uv);
+
+#else
+    // multigrid iteration
+    int nrefine = 3;
+    if (argc > 1)  nrefine = atoi(argv[1]);
+
+    //Multigrid ggm(Mesh("square_tiny.txt"),nrefine);
+    Multigrid ggm(Mesh("square_100.txt"), nrefine);
+
+    ggm.DefineOperators();
+
+    cout << "\n#############  SOLVE   ###############\n";
+
+    double tstart = omp_get_wtime();                  // OpenMP
+
+    //ggm.JacobiSolve(my_level);
+    //ggm.MG_Step(my_level, 1, true, 1);
+    ggm.MG_Solve(2, 1e-6, 1);
+
+    double t1 = omp_get_wtime() - tstart;             // OpenMP
+    cout << "MgSolve: timing in sec. : " << t1 << "   for " << ggm.Ndofs() << " dofs" << endl;
+
+
+    Test_solver(nrefine - 1, {1, 2, 4, 8, 16, 32, 64, 128, 256}, ggm);
+
+    //int my_level=nrefine-1;
+    //const auto &ml=ggm.GetMesh(my_level);
+    //const auto &sl=ggm.GetSolution(my_level);
+    //ml.Visualize(sl);
+    //////ml.Visualize_paraview(sl);
+    ////ml.Export_scicomp("level_"+to_string(my_level));
+
+    //int my_level=nrefine-1;
+    //const auto &mesh=ggm.GetMesh(my_level);
+    //const auto &uv=ggm.GetSolution(my_level);
+    //vector<double> exact_sol(size(uv));
+    //mesh.SetValues(exact_sol, [](double x, double y) -> double {
+    //return x * x * std::sin(2.5 * M_PI * y);
+    //} );
+    //mesh.Visualize(getAbsError(exact_sol, uv));
+
+#endif
+    return 0;
+}
+
+
+void Test_solver(int /*lev*/, vector<int> const &nthreads, Multigrid &ggm)
+{
+    cout << endl << endl << "-------------------------------------" << endl;
+    cout << "MgSolve: timing in sec. for " << ggm.Ndofs() << " dofs" << endl;
+    cout << "sec         threads" << endl;
+    vector<double> mg_time(size(nthreads), -1.0);
+
+    for (size_t k = 0; k < size(nthreads); ++k) {
+        omp_set_num_threads(nthreads.at(k));
+        double tstart = omp_get_wtime();
+        ggm.MG_Solve(2, 1e-6, 1);
+        double t1 = omp_get_wtime() - tstart;
+        mg_time.at(k) = t1;
+        cout << t1 << " : " << nthreads.at(k) << endl;
+    }
+}
--- a/mgrid_2/main.cpp.orig
+++ b/mgrid_2/main.cpp.orig
@ -0,0 +1,165 @@
+//		MPI code in C++.
+//		See [Gropp/Lusk/Skjellum, "Using MPI", p.33/41 etc.]
+//		and  /opt/mpich/include/mpi2c++/comm.h  for details
+
+#include "geom.h"
+#include "getmatrix.h"
+#include "jacsolve.h"
+#include "userset.h"
+#include "vdop.h"
+
+#include <cassert>
+#include <chrono>           // timing
+#include <cmath>
+#include <iostream>
+#include <omp.h>
+using namespace std;
+using namespace std::chrono;  // timing
+
+
+void Test_solver(int lev, vector<int> const & nthreads, Multigrid& ggm);
+
+int main(int argc , char **argv )
+{
+#undef MG
+#ifndef MG
+                                 // Jacobi iteration
+        int nrefine = 0;
+        if (argc>1)  nrefine = atoi(argv[1]);
+        //Mesh const mesh("square_tiny.txt");
+        //Mesh const mesh("square_100.txt");
+        //Mesh const mesh("L_shape.txt");
+
+        //Mesh const mesh_c("square_tiny.txt");
+        Mesh const mesh_c("square_100.txt");
+        //Mesh const mesh_c("square.txt");
+        bool ba = mesh_c.checkObtuseAngles();
+        if (ba) cout << "mesh corrected" << endl;
+        //mesh_c.Debug();
+        //mesh_c.DebugEdgeBased();
+
+        //RefinedMesh mesh(mesh_c);                  // OK, works
+        //Mesh const mesh("square_tiny.txt");
+        ////mesh.Debug();
+        //mesh.RefineAllElements(nrefine);           // OK, works
+
+        gMesh_Hierarchy ggm(mesh_c,nrefine);
+        const Mesh& mesh=ggm.finest();
+
+        //mesh.Debug();
+        //mesh.DebugEdgeBased();
+
+
+        FEM_Matrix SK(mesh);                   // CRS matrix
+        //SK.writeBinary("sparseMatrix.bin");
+        //SK.Debug();
+
+        vector<double> uv(SK.Nrows(),0.0);     // temperature
+        vector<double> fv(SK.Nrows(),0.0);     // r.h.s.
+
+        SK.CalculateLaplace(fv);                       // matrix
+        SK.CalculateRHS(fv, [](double x, double y)     // rhs
+                        {return std::sin(M_PI*2.5*y)*(M_PI*M_PI*2.5*2.5*x*x - 2);}
+                       );
+        //SK.CheckRowSum();
+        SK.CheckMatrix();
+        //return 0;
+        //SK.Debug();
+
+        //mesh.SetU(uv);         // deprecated
+        // Two ways to initialize the vector
+        //mesh.SetValues(uv,f_zero);             // user function
+        //mesh.SetValues(uv, [](double x, double y) -> double {return 0.0*x*y;} );  // lambda function
+        //mesh.SetValues(uv, [](double x, double y) -> double {return 5e-3*(x+1)*(y+1);} );  // lambda function
+        //
+        mesh.SetValues(uv, [](double x, double y) -> double {
+            return x * x * std::sin(2.5 * M_PI * y);
+            } );
+
+        SK.ApplyDirichletBC(uv,fv);
+
+        //SK.Compare2Old(nnode, id, ik, sk);
+        //SK.Debug();
+        auto exact_sol(uv);
+
+        //SK.Mult(fv,uv);
+
+        auto t3 = system_clock::now(); // start timer
+
+        JacobiSolve(SK, fv, uv );          // solve the system of equations
+
+        auto t4 = system_clock::now();  // stop timer
+        auto duration = duration_cast<microseconds>(t4 - t3);        // duration in microseconds
+        double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+        cout << "JacobiSolve: timing in sec. : " << t_diff << endl;
+
+        auto [val,idx] = findLargestAbsError(exact_sol, uv, 1e+6, 100);
+
+        //mesh.Visualize(getAbsError(exact_sol, uv));
+
+
+        //mesh.Write_ascii_matlab("uv.txt", uv);
+        mesh.Visualize(uv);
+    
+#else
+                                      // multigrid iteration
+        int nrefine = 3;
+        if (argc>1)  nrefine = atoi(argv[1]);
+
+        //Multigrid ggm(Mesh("square_tiny.txt"),nrefine);
+        Multigrid ggm(Mesh("square_100.txt"),nrefine);
+
+        ggm.DefineOperators();
+
+        cout << "\n#############  SOLVE   ###############\n";
+
+        double tstart = omp_get_wtime();                  // OpenMP
+
+        //ggm.JacobiSolve(my_level);
+        //ggm.MG_Step(my_level, 1, true, 1);
+        ggm.MG_Solve(2, 1e-6, 1);
+
+        double t1 = omp_get_wtime() - tstart;             // OpenMP
+        cout << "MgSolve: timing in sec. : " << t1 << "   for " << ggm.Ndofs()<< " dofs"<< endl;
+
+
+        Test_solver(nrefine-1, {1,2,4,8,16,32,64,128,256}, ggm);
+
+        //int my_level=nrefine-1;
+        //const auto &ml=ggm.GetMesh(my_level);
+        //const auto &sl=ggm.GetSolution(my_level);
+        //ml.Visualize(sl);
+        //////ml.Visualize_paraview(sl);
+        ////ml.Export_scicomp("level_"+to_string(my_level));
+
+        //int my_level=nrefine-1;
+        //const auto &mesh=ggm.GetMesh(my_level);
+        //const auto &uv=ggm.GetSolution(my_level);
+        //vector<double> exact_sol(size(uv));
+        //mesh.SetValues(exact_sol, [](double x, double y) -> double {
+            //return x * x * std::sin(2.5 * M_PI * y);
+            //} );
+        //mesh.Visualize(getAbsError(exact_sol, uv));
+    
+#endif
+    return 0;
+}
+
+
+void Test_solver(int /*lev*/, vector<int> const & nthreads, Multigrid& ggm)
+{
+    cout << endl << endl << "-------------------------------------" << endl;
+    cout << "MgSolve: timing in sec. for " << ggm.Ndofs()<< " dofs"<< endl;
+    cout << "sec         threads" << endl;
+    vector<double> mg_time(size(nthreads),-1.0);
+
+    for (size_t k=0; k<size(nthreads); ++k)
+    {
+        omp_set_num_threads(nthreads.at(k));
+        double tstart = omp_get_wtime();
+        ggm.MG_Solve(2, 1e-6, 1);
+        double t1 = omp_get_wtime() - tstart;
+        mg_time.at(k) = t1;
+        cout << t1 << " : " << nthreads.at(k) << endl;
+    }
+}
--- a/mgrid_2/numresults.pdf
+++ b/mgrid_2/numresults.pdf
--- a/mgrid_2/square_06.txt
+++ b/mgrid_2/square_06.txt
@ -0,0 +1,30 @@
+9
+2
+8
+3
+0	0
+1	0
+1	1
+0	1
+0.5	0
+1	0.5
+0.5	1
+0	0.5
+0.5	0.5
+8	1	9
+5	2	9
+6	3	9
+7	4	9
+1	5	9
+2	6	9
+3	7	9
+4	8	9
+8
+1	5
+5	2
+2	6
+6	3
+3	7
+7	4
+4	8
+8	1
--- a/mgrid_2/square_100.txt
+++ b/mgrid_2/square_100.txt
--- a/mgrid_2/square_tiny.txt
+++ b/mgrid_2/square_tiny.txt
@ -0,0 +1,95 @@
+13
+2
+16
+3
+0
+0
+1
+0
+1
+1
+0
+1
+0.5
+0
+1
+0.5
+0.5
+1
+0
+0.5
+0.4999999999999999
+0.4999999999999999
+0.3333333333333333
+0.6666666666666666
+0.6666666666666666
+0.6666666666666666
+0.6666666666666666
+0.3333333333333333
+0.3333333333333333
+0.3333333333333333
+8
+1
+13
+5
+2
+12
+6
+3
+11
+7
+4
+10
+1
+5
+13
+10
+8
+13
+2
+6
+12
+3
+7
+11
+4
+8
+10
+12
+9
+13
+10
+9
+11
+7
+10
+11
+11
+9
+12
+6
+11
+12
+9
+10
+13
+5
+12
+13
+8
+1
+5
+5
+2
+2
+6
+6
+3
+3
+7
+7
+4
+4
+8
+8
+1
--- a/mgrid_2/tet_elem.m
+++ b/mgrid_2/tet_elem.m
@ -0,0 +1,29 @@
+%% 3D: P1 element matrix tetrahedral element
+
+x = sym('x', [4 1]);
+y = sym('y', [4 1]);
+z = sym('z', [4 1]);
+
+A = [ones(4,1), x , y, z ];
+
+detA = det(A);
+Ainv = inv(A);
+T = Ainv*detA;      %  T/detA == Ainv
+
+grad_phi1=T(2:4,1)  %  /detA
+grad_phi2=T(2:4,2)  %  /detA
+grad_phi3=T(2:4,3)  %  /detA
+grad_phi4=T(2:4,4)  %  /detA
+
+%% Laplace 3D
+%   - laplace u = rhs
+clear
+syms x y z
+% u(x,y,z) = x^2*sin(pi*y)*cos(pi*z)
+u(x,y,z) = (x^2*(2*x - 3))*cos(pi*y)*cos(pi*z)
+
+gu(x,y,z) = simplify(gradient(u));
+% rhs = -diff(u,2)
+% rhs = -divergence(gradient(u))
+rhs = -laplacian(u,[x,y,z]);
+rhs(x,y,z) = simplify(rhs)
--- a/mgrid_2/userset.cpp
+++ b/mgrid_2/userset.cpp
@ -0,0 +1,17 @@
+#include "userset.h"
+#include <cmath>
+
+
+double FunctF(double const x , double const y)
+{
+// return  std::sin(3.14159*1*x)*std::sin(3.14159*1*y);
+//  return 16.0*1024. ;
+// return (double)1.0 ;
+    //return x * x * std::sin(2.5 * 3.14159 * y);
+    return x * x * std::sin(2.5 * M_PI * y);
+}
+
+double FunctU(const double /* x */, double const /* y */)
+{
+    return 1.0 ;
+}
--- a/mgrid_2/userset.h
+++ b/mgrid_2/userset.h
@ -0,0 +1,119 @@
+#pragma once
+#include <cmath>
+/**
+ * User function: f(@p x,@p y)
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value for right hand side f(@p x,@p y)
+ */
+double FunctF(double x, double y);
+
+/**
+ * User function: u(@p x,@p y)
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value for solution vector u(@p x,@p y)
+ */
+double FunctU(double x, double y);
+
+
+/**
+ * User function: f(@p x,@p y) = @f$ x^2 \sin(2.5\pi y)@f$.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value f(@p x,@p y)
+ */
+inline
+double fNice(double const x, double const y)
+{
+    //return x * x * std::sin(2.5 * M_PI * y);               // solution u
+    return std::sin(M_PI*2.5*y)*(M_PI*M_PI*2.5*2.5*x*x - 2); // -Laplacian(u)
+}
+
+/**
+ * User function: f(@p x,@p y,@p z).
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @param[in] z		z-coordinate of discretization point
+ * @return  value f(@p x,@p y,@p z)
+ */
+inline
+double fNice(double const x, double const y, double const z)
+{
+    return std::sin(M_PI*2.5*y)*(M_PI*M_PI*2.5*2.5*x*x - 2)+z;
+}
+
+/**
+ * User function: f(@p x,@p y) = 0$.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value 0
+ */
+inline
+double f_zero(double const x, double const y)
+//double f_zero(double const  /*x*/, double const /*y*/)
+{
+    return 0.0 + 0.0*(x+y);
+}
+
+// ---------------------------------------------------------------------
+
+/**
+ * solution u(@p x,@p y) of -Laplacian.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value u(@p x,@p y)
+ * @see rhs_lap2
+ */
+inline
+double sol_lap2(double const x, double const y)
+{
+    return x * x * std::sin(2.5 * M_PI * y);
+}
+
+/**
+ * Right hand side f(@p x,@p y) of -Laplacian(u) with the appropriate function u.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value f(@p x,@p y)
+ * @see sol_lap2
+ */
+inline
+double rhs_lap2(double const x, double const y)
+{
+    return std::sin(M_PI*2.5*y)*(M_PI*M_PI*2.5*2.5*x*x - 2);
+}
+
+// ---------------------------------------------------------------------
+
+/**
+ * solution u(@p x,@p y,@p z) of -Laplacian.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @param[in] z		z-coordinate of discretization point
+ * @return  value u(@p x,@p y,@p z)
+ * @see rhs_lap3
+ */
+inline
+double sol_lap3(double const x, double const y, double const z)
+{
+    //return x*x*std::cos(M_PI*z)*std::sin(M_PI*y);
+    return (x*x*(2*x - 3))*std::cos(M_PI*z)*std::cos(M_PI*y);
+}
+
+/**
+ * Right hand side f(@p x,@p y,@p z) of -Laplacian(u) with the appropriate function u.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @param[in] z		z-coordinate of discretization point
+ * @return  value f(@p x,@p y,@p z)
+ * @see sol_lap3
+ */
+inline
+double rhs_lap3(double const x, double const y, double const z)
+{
+    //return 2.0*std::cos(M_PI*z)*std::sin(M_PI*y)*(M_PI*M_PI*x*x - 1);
+    return -2.0*std::cos(M_PI*z)*std::cos(M_PI*y)*(- 2*M_PI*M_PI*x*x*x + 3*M_PI*M_PI*x*x + 6*x - 3);
+}
+
+// ---------------------------------------------------------------------
--- a/mgrid_2/utils.h
+++ b/mgrid_2/utils.h
@ -0,0 +1,212 @@
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <iomanip>       // setw
+#include <iostream>
+#include <map>
+#include <numeric>
+#include <utility>        // swap()
+#include <vector>
+
+///**
+ //* Output of a vector of @p v.
+ //*
+ //* @param[in,out] s    output stream
+ //* @param[in]     v    vector
+ //* @return             output stream 
+ //*/
+//template <class T>
+//std::ostream& operator<<(std::ostream &s, const std::vector<T> &v)
+//{
+    //for (auto it: v) { std::cout << "  " << std::setw(5) << it; }
+    //return s;
+//}
+
+
+/**
+ * Determines the permutation vector for sorting @p v ascending with operator< .
+ *
+ * @param[in] v    vector to permute
+ * @return         index vector for permutation, 
+ *                 @p v[idx[0]] denotes the smallest element of original data.
+ */
+template <typename T>
+std::vector<int> sort_indexes(const std::vector<T> &v)
+{
+    // initialize original index locations
+    std::vector<int> idx(v.size());
+    iota(begin(idx),end(idx),0);
+
+    // sort indexes based on comparing values in v
+    sort(idx.begin(), idx.end(),
+           [&v](int i1, int i2) -> bool
+            { return v[i1] < v[i2]; }
+        );
+
+    return idx;
+}
+
+
+/**
+ * Determines the permutation vector for sorting @p v descending with operator> .
+ *
+ * @param[in] v    vector to permute
+ * @return         index vector for permutation, 
+ *                 @p v[idx[0]] denotes the smallest element of original data.
+ */
+template <typename T>
+std::vector<int> sort_indexes_desc(const std::vector<T> &v)
+{
+    // initialize original index locations
+    std::vector<int> idx(v.size());
+    iota(begin(idx),end(idx),0);
+
+    // sort indexes based on comparing values in v
+    sort(idx.begin(), idx.end(),
+           [&v](int i1, int i2) -> bool
+            { return v[i1] > v[i2]; }
+        );
+
+    return idx;
+}
+
+/**
+ * Generates the inverse permutation vector of @p idx.
+ *
+ * @param[in] idx  permutation vector
+ * @return         inverse permutation, 
+ */
+template <typename T>
+std::vector<T> inverse_indexes(const std::vector<T> &idx)
+{
+    static_assert(std::is_integral<T>(),"Vector elements have to be integral numbers.");
+    return sort_indexes(idx);
+}
+
+
+/**
+ * Resort elements of vector @p x according to permutation @p old2new.
+ *
+ * @param[in]     old2new    permutation vector
+ * @param[in,out] x          vector[]
+ */
+template <class T>
+void permute(std::vector<int> const& old2new, std::vector<T> & x)
+{
+    assert(x.size()==old2new.size());
+
+    auto old(x);
+    for (size_t k=0; k<old2new.size(); ++k)
+    {
+        const size_t &k_old = old2new[k];
+        x[k] = old[k_old];
+    }
+}
+
+/**
+ * Resort a vector @p x storing two entries per element 
+ * according to permutation @p old2new.
+ *
+ * @param[in]     old2new    permutation vector
+ * @param[in,out] x          vector[][2]
+ */
+template <class T>
+void permute_2(std::vector<int> const& old2new, std::vector<T> & x)
+{
+    assert(x.size()==2*old2new.size());
+
+    auto old(x);
+    for (size_t k=0; k<old2new.size(); ++k)
+    {
+        const size_t &k_old = old2new[k];
+        x[2*k  ] = old[2*k_old  ];
+        x[2*k+1] = old[2*k_old+1];
+    }
+}
+
+/**
+ * Changes the entries in @p x according to renumbering @p  old2new.
+ *
+ * @param[in]     old2new    renumbering vector
+ * @param[in,out] x          vector
+ */
+template <class T>
+void reNumberEntries(std::vector<T> const& old2new, std::vector<T> & x)
+{
+    static_assert(std::is_integral<T>(),"Vector elements have to be integral numbers.");
+    assert( *max_element(cbegin(x),cend(x)) < static_cast<int>(old2new.size()) );
+    auto old(x);
+    auto const n2o = inverse_indexes(old2new);
+    for (size_t k=0; k<x.size(); ++k)
+    {
+        T const k_old = old[k];
+        x[k] = n2o[k_old];        
+    }
+}
+
+/**
+ * Changes the entries in @p x according to renumbering @p  old2new.
+ *
+ * @param[in]     old2new    renumbering vector
+ * @param[in,out] x          vector
+ */
+template <class T>
+void reNumberEntries(std::vector<T> const& old2new, std::map<T,T> & x)
+{
+    static_assert(std::is_integral<T>(),"Vector elements have to be integral numbers.");
+    assert( max_element(cbegin(x),cend(x))->second < static_cast<int>(old2new.size()) );
+    auto old(x);
+    auto const n2o = inverse_indexes(old2new);   
+    for ( auto &[key,val] : x)
+    {
+        val = n2o[val];
+    }
+}
+
+/**
+ * Sort the two entries per element of vector @p x ascending. 
+ *
+ * @param[in] x          vector[][2]
+ */
+template <class T>
+void sortAscending_2(std::vector<T> & x)
+{
+    static_assert(std::is_integral<T>(),"Vector elements have to be integral numbers.");
+    //for (auto it=begin(x); it!=end(x); it += 2)    // general solution
+    //{
+        //sort(it,it+1);
+    //}
+    
+    for (size_t k=0; k<x.size(); k+=2)               // for only 2 entries
+    {
+        if (x[k]>x[k+1]) { std::swap(x[k],x[k+1]); }
+    }
+}
+
+
+#include <chrono>           // contains timing routines
+typedef std::chrono::system_clock Time;
+typedef std::chrono::milliseconds ms;
+typedef std::chrono::duration<double> dsec;
+
+class MyTimer
+{
+    public:
+    
+    MyTimer() : _tstart(Time::now()) {}
+    
+    void tic()
+    { _tstart = Time::now(); }
+    
+    auto toc()
+    {
+        dsec timed = Time::now() - _tstart;
+        return timed.count();
+    }
+    
+    
+    private:
+    std::chrono::time_point<Time> _tstart;
+    
+};
--- a/mgrid_2/uv.txt
+++ b/mgrid_2/uv.txt
--- a/mgrid_2/vdop.cpp
+++ b/mgrid_2/vdop.cpp
@ -0,0 +1,187 @@
+#include "vdop.h"
+#include <algorithm>
+#include <cassert>               // assert()
+#include <cmath>
+#include <iostream>
+#include <tuple>               // tuple
+#include <vector>
+using namespace std;
+
+
+void vddiv(vector<double> & x, vector<double> const& y,
+                               vector<double> const& z)
+{
+    assert( x.size()==y.size() && y.size()==z.size() );
+    size_t n = x.size();
+
+#pragma omp parallel for    
+    for (size_t k = 0; k < n; ++k)
+    {
+        x[k] = y[k] / z[k];
+    }
+    }
+
+//******************************************************************************
+
+void vdaxpy(std::vector<double> & x, std::vector<double> const& y,
+                       double alpha, std::vector<double> const& z )
+{
+    assert( x.size()==y.size() && y.size()==z.size() );
+    size_t n = x.size();
+
+//#pragma omp parallel for    
+#pragma omp for    
+    for (size_t k = 0; k < n; ++k)
+    {
+        x[k] = y[k] + alpha * z[k];
+    }
+    }
+//******************************************************************************
+
+double dscapr(std::vector<double> const& x, std::vector<double> const& y)
+{
+    assert( x.size()==y.size());
+    size_t n = x.size();
+
+    double    s = 0.0;
+#pragma omp parallel for reduction(+:s)
+    for (size_t k = 0; k < n; ++k)
+    {
+        s += x[k] * y[k];
+    }
+
+    return s;
+}
+
+
+void dscapr(std::vector<double> const& x, std::vector<double> const& y, double &s)
+{
+    assert( x.size()==y.size());
+    size_t n = x.size();
+
+    #pragma omp single
+    s = 0.0;
+    double s_local = 0.0;              // initialize local variable
+//#pragma omp parallel for reduction(+:s)
+#pragma omp for 
+    for (size_t k = 0; k < n; ++k)
+    {
+        s_local += x[k] * y[k];
+    }
+    
+    #pragma omp atomic update
+    s += s_local;
+    #pragma omp barrier
+
+    }
+
+
+//******************************************************************************
+void DebugVector(vector<double> const &v)
+{
+    cout << "\nVector  (nnode = " << v.size() << ")\n";
+    for (double j : v)
+    {
+        cout.setf(ios::right, ios::adjustfield);
+        cout << j << "   ";
+    }
+    cout << endl;
+}
+//******************************************************************************
+bool CompareVectors(vector<double> const& x, int const n, double const y[], double const eps)
+{
+    bool bn = (static_cast<int>(x.size())==n);
+    if (!bn)
+    {
+        cout << "#########   Error: " << "number of elements" << endl;
+    }
+    //bool bv = equal(x.cbegin(),x.cend(),y);
+    bool bv = equal(x.cbegin(),x.cend(),y,
+                          [eps](double a, double b) -> bool
+                          { return std::abs(a-b)<eps*(1.0+0.5*(std::abs(a)+ std::abs(b))); }
+    );
+    if (!bv)
+    {
+        assert(static_cast<int>(x.size())==n);
+        cout << "#########   Error: " << "values" << endl;
+    }
+    return bn && bv;
+}
+
+//******************************************************************************
+vector<double> getAbsError(vector<double> const& x, vector<double> const& y)
+{
+    assert(size(x)==size(y));
+    vector<double> err(size(x));
+    
+    for (size_t k=0; k<size(err); ++k)
+    {
+        //err[k] = std::abs( x[k]-y[k] );
+        err[k] = x[k]-y[k];
+    }
+    return err;
+}
+
+//******************************************************************************
+tuple<double, int>  findLargestAbsError(vector<double> const& x, vector<double> const& y,
+    double eps, int nlarge)
+{
+    vector<double> const err = getAbsError(x,y); 
+    int const nerr=err.size();
+    
+    if (nlarge>0)
+    {
+        auto idx = sort_indexes_desc(err);
+        for (int k=0; k<min(nlarge,nerr); ++k)
+        {
+            if ( err[idx[k]]>=eps )
+            {
+                //cout << "err[" << idx[k] << "] = " << err[idx[k]] << endl;
+                cout << "err[" << idx[k] << "] = " << err[idx[k]] 
+                << "     " << x[idx[k]] << "  vs.  " << y[idx[k]] << endl;
+            }
+        }
+    }
+    
+    auto ip = max_element(cbegin(err),cend(err));
+    return make_tuple( *ip, ip-cbegin(err) );
+}
+
+bool vectorsAreEqual(vector<double> const& x, vector<double> const& y, double eps, int nlarge)
+{
+    bool bb=size(x)==size(y);
+    bb = bb && equal( cbegin(x),cend(x), cbegin(y),
+    [eps](auto const a, auto const b)->bool 
+    { return std::abs(a-b) <= eps*(1.0+(std::abs(a)+std::abs(b))/2.0); } 
+    );
+    
+    if (!bb)
+    {
+        findLargestAbsError(x,y,eps,nlarge);
+    }
+    return bb;
+}
+
+
+
+
+
+
+//******************************************************************************
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/mgrid_2/vdop.h
+++ b/mgrid_2/vdop.h
@ -0,0 +1,116 @@
+#pragma once
+#include "utils.h"
+
+#include <iostream>
+#include <tuple>
+#include <vector>
+
+/** @brief  Element-wise vector divison x_k = y_k/z_k.
+ *
+ * @param[out] x  target vector
+ * @param[in]  y  source vector
+ * @param[in]  z  source vector
+ *
+ */
+void vddiv(std::vector<double> & x, std::vector<double> const& y,
+                                    std::vector<double> const& z);
+
+/** @brief  Element-wise daxpy operation x(k) = y(k) + alpha*z(k).
+ *
+ * @param[out] x  target vector
+ * @param[in]  y  source vector
+ * @param[in]  alpha  scalar
+ * @param[in]  z  source vector
+ *
+ */
+void vdaxpy(std::vector<double> & x, std::vector<double> const& y,
+                       double alpha, std::vector<double> const& z );
+
+
+/** @brief  Calculates the Euclidean inner product of two vectors.
+ *
+ * @param[in]  x vector
+ * @param[in]  y vector
+ * @return Euclidean inner product @f$\langle x,y \rangle@f$
+ *
+ */
+double dscapr(std::vector<double> const& x, std::vector<double> const& y);
+void dscapr(std::vector<double> const& x, std::vector<double> const& y, double &s);
+
+
+inline
+double L2_scapr(std::vector<double> const& x, std::vector<double> const& y)
+{
+    return dscapr(x,y)/static_cast<double>(x.size());
+}
+
+/**
+ * Print entries of a vector.
+ * @param[in] v	    vector values
+*/
+void DebugVector(std::vector<double> const &v);
+
+/** @brief  Compares an STL vector with POD vector.
+ *
+ * The accuracy criteria @f$ |x_k-y_k| < \varepsilon \left({1+0.5(|x_k|+|y_k|)}\right) @f$
+ * follows the book by
+ * <a href="https://www.springer.com/la/book/9783319446592">Stoyan/Baran</a>, p.8.
+ *
+ * @param[in]  x    STL vector
+ * @param[in]  n    length of POD vector
+ * @param[in]  y    POD vector
+ * @param[in]  eps  relative accuracy criteria (default := 0.0).
+ * @return true iff pairwise vector elements are relatively close to each other.
+ *
+ */
+bool CompareVectors(std::vector<double> const& x, int n, double const y[], double eps=0.0);
+
+
+/** Output operator for vector
+ * 	@param[in,out] s	output stream, e.g. @p cout
+ *  @param[in]     v    vector
+ *
+ *	@return    output stream
+*/
+template <class T>
+std::ostream& operator<<(std::ostream &s, std::vector<T> const &v)
+{
+    for (auto vp: v)
+    {
+        s << vp << "  ";
+    }
+    return s;
+}
+
+/** Calculate the absolute difference between @p x and @p y.
+ * 
+ * 	@param[in]     x    vector
+ *  @param[in]     y    vector
+ *
+ *	@return    absolute error vector
+*/
+std::vector<double> getAbsError(std::vector<double> const& x, std::vector<double> const& y);
+
+
+/** Find the component wise largest absolute difference between @p x and @p y.
+ *  Optional via @p eps and @p nlarge the index and the value of the @p nlarge 
+ *  error components will be printed to standard output.
+ * 
+ * 	@param[in]     x    vector
+ *  @param[in]     y    vector
+ *  @param[in]     eps  threshold
+ *  @param[in]     nlarge  number of largest components to print.
+ *
+ *	@return    (max. error value, its index)
+*/
+std::tuple<double, int>  findLargestAbsError(
+    std::vector<double> const& x, 
+    std::vector<double> const& y,
+    double eps=1e-6, int nlarge=0)           ;
+    
+    
+bool vectorsAreEqual(
+    std::vector<double> const& x, 
+    std::vector<double> const& y,
+    double eps=1e-6, int nlarge=0)           ;
+
--- a/mgrid_2/visualize_results.m
+++ b/mgrid_2/visualize_results.m
@ -0,0 +1,20 @@
+%% Visualize results
+%
+%   flatpak run org.octave.Octave <filename>
+%      or
+%   octave --no-window-system --no-gui  -qf <filename>
+%
+%      or
+%   matlab -nosplash <   <filename>
+
+clear all
+clc
+
+%%
+fname = 'uv.txt';
+
+[xc,ia,v] = ascii_read_meshvector(fname);
+
+h = trisurf(ia, xc(:,1), xc(:,2), v);
+
+waitfor(h)                     % wait for closing the figure