/*
 * This piece of code is written for the numerical tests in "Mathematical and computational
 * models of incompressible materials subject to shear". In order to run the program, a version
 * of deal.II (available on http://www.dealii.org/) need to be installed. The easiest way to run
 * it would be to copy this file to deal.II/examples/step-22 after installation, change "step-22"
 * in Makefile to this file name and then type "make run" in the command line.
 * Most part of code is written based on step-22 in deal.II tutorial
 * (http://www.dealii.org/developer/doxygen/deal.II/step_22.html). Tutorial step-22 has very detailed
 * comments explaining every step of the code, so repeated comments are omitted here.
 */


#include <deal.II/base/quadrature_lib.h>
#include <deal.II/base/logstream.h>
#include <deal.II/base/function.h>
#include <deal.II/base/utilities.h>

#include <deal.II/lac/block_vector.h>
#include <deal.II/lac/full_matrix.h>
#include <deal.II/lac/block_sparse_matrix.h>
#include <deal.II/lac/solver_cg.h>
#include <deal.II/lac/solver_minres.h>
#include <deal.II/lac/precondition.h>
#include <deal.II/lac/constraint_matrix.h>

#include <deal.II/grid/tria.h>
#include <deal.II/grid/grid_generator.h>
#include <deal.II/grid/tria_accessor.h>
#include <deal.II/grid/tria_iterator.h>
#include <deal.II/grid/tria_boundary_lib.h>
#include <deal.II/grid/grid_tools.h>
#include <deal.II/grid/grid_refinement.h>

#include <deal.II/dofs/dof_handler.h>
#include <deal.II/dofs/dof_renumbering.h>
#include <deal.II/dofs/dof_accessor.h>
#include <deal.II/dofs/dof_tools.h>

#include <deal.II/fe/fe_q.h>
#include <deal.II/fe/fe_dgq.h>
#include <deal.II/fe/fe_dgp.h>
#include <deal.II/fe/fe_system.h>
#include <deal.II/fe/fe_values.h>

#include <deal.II/numerics/vectors.h>
#include <deal.II/numerics/matrices.h>
#include <deal.II/numerics/data_out.h>
#include <deal.II/numerics/error_estimator.h>

#include <deal.II/lac/sparse_direct.h>
#include <deal.II/lac/sparse_ilu.h>
#include <deal.II/lac/lapack_full_matrix.h>
#include <deal.II/lac/householder.h>

#include <fstream>
#include <sstream>

const double PULL = 0.1;
const bool MY_DEBUG = false;

namespace Shear
{
using namespace dealii;

template <int dim>
struct InnerPreconditioner;
template <>
struct InnerPreconditioner<2>
{
	typedef SparseDirectUMFPACK type;
};
template <>
struct InnerPreconditioner<3>
{
	typedef SparseILU<double> type;
};

// functions for constructing mass matrix
template <int dim>
void construct_dW_dC(Tensor<2,dim> &dW_dC, Tensor<2,dim> &Identity, Tensor<2,dim> &C, Tensor<2,dim> &C_inverse, double J, double mu)
{
	Tensor<2,dim> first_term;
	first_term.clear ();

	for (int beta=0; beta<dim; ++beta)
	{
		for (int alpha=0; alpha<dim; ++alpha)
		{
			for (int gamma=0; gamma<dim; ++gamma)
			{
				first_term[alpha][beta] += C_inverse[beta][alpha]*C[gamma][gamma];
			}
		}
	}
	dW_dC = 0.5 * mu * pow(J,-2.0/3.0) * (-1.0/3.0*first_term + Identity);
}

template <int dim>
void rank2TensorMultNoContraction( Tensor<4,dim> &result,
		Tensor<2,dim> &A, Tensor<2,dim> &B)
{
	for (int alpha = 0; alpha < dim; ++alpha)
	{
		for (int beta = 0; beta < dim; ++beta)
		{
			for (int gamma = 0; gamma < dim; ++gamma)
			{
				for (int epsilon = 0; epsilon < dim; ++epsilon)
				{
					result[alpha][beta][gamma][epsilon]=A[alpha][beta]*B[gamma][epsilon];
				}
			}
		}
	}

}
template <int dim>
void construct_C_ISO( Tensor<4,dim> &C_ISO, Tensor<2,dim> &C, Tensor<2,dim> &C_inverse, Tensor<2,dim> &Identity, double J, double mu)
{
	Tensor<4,dim> first_term;
	first_term.clear ();
	Tensor<4,dim> second_term;
	second_term.clear ();
	Tensor<4,dim> third_term;
	third_term.clear ();
	Tensor<4,dim> fourth_term;
	fourth_term.clear ();

	for (int epsilon = 0; epsilon<dim; ++epsilon)
	{
		for (int gamma = 0; gamma<dim; ++gamma)
		{
			for (int beta=0;beta<dim;++beta)
			{
				for (int alpha = 0; alpha<dim; ++alpha)
				{
					for (int theta=0; theta<dim; ++theta)
					{
						first_term[epsilon][gamma][beta][alpha]+=C_inverse[epsilon][gamma]*C_inverse[beta][alpha]*C[theta][theta];
					}
				}
			}
		}
	}
	first_term = first_term *(1.0/9.0)*pow(J,-2.0/3.0)*mu;

	for (int epsilon = 0; epsilon<dim; ++epsilon)
	{
		for (int alpha = 0; alpha<dim; ++alpha)
		{
			for (int beta=0;beta<dim;++beta)
			{
				for (int gamma = 0; gamma<dim; ++gamma)
				{
					for (int theta=0; theta<dim; ++theta)
					{
						second_term[epsilon][alpha][beta][gamma]+=
								(C_inverse[beta][gamma]*C_inverse[epsilon][alpha]+C_inverse[beta][epsilon]*C_inverse[gamma][alpha])*C[theta][theta];
					}
				}
			}
		}
	}
	second_term = second_term*(1.0/6.0)*pow(J,-2.0/3.0)*mu;

	rank2TensorMultNoContraction(third_term,C_inverse,Identity);
	third_term = -(1.0/3.0)*pow(J,-2.0/3.0)*mu*third_term;
	rank2TensorMultNoContraction(fourth_term,C_inverse,Identity);
	fourth_term = -(1.0/3.0)*pow(J,-2.0/3.0)*mu*fourth_term;

	for (int alpha = 0; alpha<dim; ++alpha)
	{
		for (int beta=0;beta<dim;++beta)
		{
			for (int gamma = 0; gamma<dim; ++gamma)
			{
				for (int epsilon = 0; epsilon<dim; ++epsilon)
				{
					C_ISO[alpha][beta][gamma][epsilon]=first_term[epsilon][gamma][beta][alpha]
					                                                                    +second_term[epsilon][alpha][beta][gamma]+third_term[beta][alpha][gamma][epsilon]
					                                                                                                                                             +fourth_term[epsilon][gamma][alpha][beta];
				}
			}
		}
	}
}

template <int dim>
void construct_C_VOL( Tensor<4,dim> &C_VOL, Tensor<2,dim> &C_inverse, const double p, double J)
{
	Tensor<4,dim> first_term;
	rank2TensorMultNoContraction(first_term,C_inverse,C_inverse);
	Tensor<4,dim> second_term;
	rank2TensorMultNoContraction(second_term,C_inverse,C_inverse);

	for (int alpha = 0; alpha < dim; ++alpha)
	{
		for (int beta = 0; beta < dim; ++beta)
		{
			for (int gamma = 0; gamma < dim; ++gamma)
			{
				for (int epsilon = 0; epsilon < dim; ++epsilon)
				{
					C_VOL[alpha][beta][gamma][epsilon] =
							p*J*(0.5*first_term[epsilon][gamma][beta][alpha]-
									0.5*(second_term[beta][gamma][epsilon][alpha]+second_term[beta][epsilon][gamma][alpha]));
				}
			}
		}
	}
}

template <int dim>
void construct_prev_step_info(Tensor<4,dim> &prev_step_info, Tensor<4,dim> &first_term, Tensor<4,dim> &second_term,
		Tensor<4,dim> &C_ISO, Tensor<4,dim> &C_VOL, Tensor<2,dim> &F,
		Tensor<2,dim> &Identity, Tensor<4,dim> &second_term_part)
{
	Tensor<4,dim> C_sum = C_ISO + C_VOL;
	Tensor<4,dim> first_term_part;
	first_term_part.clear ();
	first_term.clear ();
	second_term.clear ();
	prev_step_info.clear ();

	for (int alpha = 0; alpha < dim; ++alpha)
	{
		for (int beta = 0; beta < dim; ++beta)
		{
			for (int gamma = 0; gamma < dim; ++gamma)
			{
				for (int i = 0; i < dim; ++i)
				{
					for (int epsilon = 0; epsilon < dim; ++epsilon)
					{
						first_term_part[alpha][beta][gamma][i] +=
								C_sum[alpha][beta][gamma][epsilon] * F[i][epsilon];
					}
				}
			}
		}
	}

	for (int alpha = 0; alpha < dim; ++alpha)
	{
		for (int gamma = 0; gamma < dim; ++gamma)
		{
			for (int i = 0; i < dim; ++i)
			{
				for (int m = 0; m < dim; ++m)
				{
					for (int beta = 0; beta < dim; ++beta)
					{
						first_term[alpha][gamma][i][m] +=
								first_term_part[alpha][beta][gamma][i] * F[m][beta];
					}
				}
			}
		}
	}
	for (int gamma = 0; gamma < dim; ++gamma)
	{
		for (int alpha = 0; alpha < dim; ++alpha)
		{
			for (int i = 0; i < dim; ++i)
			{
				for (int m = 0; m < dim; ++m)
				{
					for (int beta = 0; beta < dim; ++beta)
					{
						second_term[gamma][alpha][i][m] +=
								Identity[gamma][beta]*second_term_part[alpha][beta][i][m];
					}
				}
			}
		}
	}

	for (int alpha = 0; alpha < dim; ++alpha)
	{
		for (int gamma = 0; gamma < dim; ++gamma)
		{
			for (int i = 0; i < dim; ++i)
			{
				for (int m = 0; m < dim; ++m)
				{
					prev_step_info[alpha][gamma][i][m] =
							2.0*first_term[alpha][gamma][i][m] + second_term[gamma][alpha][i][m];
				}
			}
		}
	}

}
template <int dim>
double construct_A11( Tensor<2,dim> &grad_phi_x_j, Tensor<4,dim> &prev_step_info, Tensor<2,dim> &grad_phi_x_i)
{
	double A11 = 0.0;

	Tensor<2,dim> first_step;
	first_step.clear ();
	for (int alpha = 0; alpha < dim; ++alpha)
	{
		for (int m = 0; m < dim; ++m)
		{
			for (int i = 0; i < dim; ++i)
			{
				for (int gamma = 0; gamma < dim; ++gamma)
				{
					first_step[alpha][m] += grad_phi_x_j[i][gamma] * prev_step_info[alpha][gamma][i][m];
				}
			}
		}
	}
	for (int alpha = 0; alpha < dim; ++alpha)
	{
		for (int m = 0; m < dim; ++m)
		{
			A11 += first_step[alpha][m] * grad_phi_x_i[m][alpha];
		}
	}
	return A11;
}


template <int dim>
double construct_A12_21(double phi_p, Tensor<2,dim> &F_inverse, Tensor<2,dim> &grad_phi_x, double J)
{
	double A12_21 = 0.0;
	for (int alpha = 0; alpha < dim; ++alpha)
	{
		for (int i = 0; i < dim; ++i)
		{
			A12_21 += F_inverse[alpha][i] * grad_phi_x[i][alpha];
		}
	}
	return (phi_p*J*A12_21);
}

template <int dim>
double construct_rhs_x(Tensor<2,dim> &intermediate, Tensor<2,dim> &delta_E)
{
	double rhs_x = 0.0;
	for (int alpha = 0; alpha < dim; ++alpha)
	{
		for (int beta = 0; beta < dim; ++beta)
		{
			rhs_x += intermediate[alpha][beta] * delta_E[alpha][beta];
		}
	}
	return (-rhs_x);
}

template <int dim>
class StokesProblem
{
public:
	StokesProblem(const unsigned int degree);
	void run();

private:
	void setup_dofs();
	void assemble_system();
	void output_results(const unsigned int iter) const;
	void output_cumulative_results(const unsigned int iter) const;
	void refine_mesh();

	const unsigned int degree;

	Triangulation<dim> triangulation;
	FESystem<dim> fe;
	DoFHandler<dim> dof_handler;

	ConstraintMatrix constraints_1st_iter; //boundary constraints for the very first Newton iteration
										   //which is the boundary condition described in the paper
	ConstraintMatrix constraints;	//boundary constraints for the rest of the iterations
									//which is 0 on both top and bottom
	BlockSparsityPattern sparsity_pattern;
	BlockSparseMatrix<double> system_matrix;

	BlockVector<double> solution;
	BlockVector<double> c_solution;	//cumulative solution
	BlockVector<double> system_rhs;
	BlockVector<double> res_of_LS;	//residual of least square

	std_cxx1x::shared_ptr<typename InnerPreconditioner<dim>::type> A_preconditioner;
	unsigned int n_u, n_p;
};

template <int dim>
class BoundaryValues : public Function<dim>
{
public:
	BoundaryValues() : Function<dim>(dim+1){}
	virtual double value (const Point<dim> &p,
			const unsigned int component=0) const { return 0; }

	virtual void vector_value (Vector<double> &value) const;
};

template <int dim>
void BoundaryValues<dim>::vector_value (Vector<double> &values) const
{
	for (unsigned int c=0; c<this->n_components;++c)
	{
		values(c) = 0;
	}
}

template <int dim>
class BoundaryValues_1st_iter : public Function<dim>
{
public:
	BoundaryValues_1st_iter() : Function<dim>(dim+1){}

	virtual double value (const Point<dim> &p,
			const unsigned int component=0) const;

	virtual void vector_value (const Point<dim> &p,
			Vector<double> &value) const;
};

template <int dim>
double BoundaryValues_1st_iter<dim>::value(const Point<dim> &p, const unsigned int component)const
{
	Assert (component < this->n_components, ExcIndexRange(component, 0, this->n_components));
	if (component == 0)
	{
		if (p[2] == 0)
			return 0;
		if (p[2] == 1)
			return PULL;
		else
			return 0;
	}
	else
		return 0;
}

template <int dim>
void BoundaryValues_1st_iter<dim>::vector_value (const Point<dim> &p, Vector<double> &values) const
{
	for (unsigned int c=0; c<this->n_components;++c)
	{
		values(c) = BoundaryValues_1st_iter<dim>::value(p,c);
	}
}


template <class Matrix, class Preconditioner>
class InverseMatrix : public Subscriptor
{
public:
	InverseMatrix (const Matrix &m,
			const Preconditioner &preconditioner);
	void vmult (Vector<double> &dst,
			const Vector<double> &src) const;
private:
	const SmartPointer<const Matrix> matrix;
	const SmartPointer<const Preconditioner> preconditioner;
};
template <class Matrix, class Preconditioner>
InverseMatrix<Matrix,Preconditioner>::InverseMatrix (const Matrix &m,
		const Preconditioner &preconditioner)
		:
		matrix (&m),
		preconditioner (&preconditioner)
		{}
template <class Matrix, class Preconditioner>
void InverseMatrix<Matrix,Preconditioner>::vmult (Vector<double> &dst,
		const Vector<double> &src) const
		{
	SolverControl solver_control (src.size(), 1e-6*src.l2_norm());
	SolverCG<> cg (solver_control);
	dst = 0;
	cg.solve (*matrix, dst, src, *preconditioner);
		}
template <class Preconditioner>
class SchurComplement : public Subscriptor
{
public:
	SchurComplement (const BlockSparseMatrix<double> &system_matrix,
			const InverseMatrix<SparseMatrix<double>, Preconditioner> &A_inverse);
	void vmult (Vector<double> &dst,
			const Vector<double> &src) const;
private:
	const SmartPointer<const BlockSparseMatrix<double> > system_matrix;
	const SmartPointer<const InverseMatrix<SparseMatrix<double>, Preconditioner> > A_inverse;
	mutable Vector<double> tmp1, tmp2;
};
template <class Preconditioner>
SchurComplement<Preconditioner>::
SchurComplement (const BlockSparseMatrix<double> &system_matrix,
		const InverseMatrix<SparseMatrix<double>,Preconditioner> &A_inverse)
		:
		system_matrix (&system_matrix),
		A_inverse (&A_inverse),
		tmp1 (system_matrix.block(0,0).m()),
		tmp2 (system_matrix.block(0,0).m())
		{}
template <class Preconditioner>
void SchurComplement<Preconditioner>::vmult (Vector<double> &dst,
		const Vector<double> &src) const
		{
	system_matrix->block(0,1).vmult (tmp1, src);
	A_inverse->vmult (tmp2, tmp1);
	system_matrix->block(1,0).vmult (dst, tmp2);
		}


//Q2-Q1 element
template <int dim>
StokesProblem<dim>::StokesProblem (const unsigned int degree):
degree(degree),
triangulation(Triangulation<dim>::maximum_smoothing),
fe (FE_Q<dim>(2),dim,
		FE_Q<dim>(1),1),
		dof_handler(triangulation)
		{}

template <int dim>
void StokesProblem<dim>::setup_dofs ()
{
	A_preconditioner.reset ();
	system_matrix.clear();

	dof_handler.distribute_dofs(fe);
	DoFRenumbering::Cuthill_McKee(dof_handler);

	std::vector<unsigned int> block_component (dim+1,0);
	block_component[dim]=1;
	DoFRenumbering::component_wise(dof_handler, block_component);
	{
		constraints.clear();
		std::vector<bool> component_mask (dim+1, true);
		component_mask[dim] = false;
		DoFTools::make_hanging_node_constraints (dof_handler,constraints);

		VectorTools::interpolate_boundary_values(dof_handler,1,BoundaryValues<dim>(),constraints,component_mask);
	}
	constraints.close();

	{
		constraints_1st_iter.clear();
		std::vector<bool> component_mask (dim+1, true);
		component_mask[dim] = false;
		DoFTools::make_hanging_node_constraints (dof_handler,constraints_1st_iter);
		VectorTools::interpolate_boundary_values(dof_handler,1,BoundaryValues_1st_iter<dim>(),constraints_1st_iter,component_mask);
	}
	constraints_1st_iter.close();

	std::vector<unsigned int> dofs_per_block(2);
	DoFTools::count_dofs_per_block (dof_handler,dofs_per_block,block_component);
	n_u = dofs_per_block[0];
	n_p = dofs_per_block[1];

	std::cout<<"% Number of active cells:"
			<< triangulation.n_active_cells()
			<< std::endl
			<< "% Number of degrees of freedom: "
			<< dof_handler.n_dofs()
			<< " (" << n_u <<'+'<<n_p<<')'
			<< std::endl;

	{
		BlockCompressedSimpleSparsityPattern csp (2,2);

		csp.block(0,0).reinit (n_u, n_u);
		csp.block(1,0).reinit (n_p, n_u);
		csp.block(0,1).reinit (n_u, n_p);
		csp.block(1,1).reinit (n_p, n_p);

		csp.collect_sizes();

		DoFTools::make_sparsity_pattern (dof_handler, csp, constraints_1st_iter, false);
		sparsity_pattern.copy_from (csp);
	}

	system_matrix.reinit (sparsity_pattern);

	solution.reinit(2);
	solution.block(0).reinit(n_u);
	solution.block(1).reinit(n_p);
	solution.collect_sizes();
	solution = 0;

	c_solution.reinit(2);
	c_solution.block(0).reinit(n_u);
	c_solution.block(1).reinit(n_p);
	c_solution.collect_sizes();
	c_solution = 0;

	system_rhs.reinit(2);
	system_rhs.block(0).reinit(n_u);
	system_rhs.block(1).reinit(n_p);
	system_rhs.collect_sizes();

	res_of_LS.reinit(2);
	res_of_LS.block(0).reinit(n_u);
	res_of_LS.block(1).reinit(n_p);
	res_of_LS.collect_sizes();
}

template <int dim>
void StokesProblem<dim>::assemble_system ()
{
	const double mu = 1.0;
	system_matrix=0;
	system_rhs=0;

	QGauss<dim> quadrature_formula(degree+2);

	FEValues<dim> fe_values (fe, quadrature_formula,
			update_values	|
			update_quadrature_points	|
			update_JxW_values	|
			update_gradients);

	const unsigned int dofs_per_cell = fe.dofs_per_cell;
	const unsigned int n_q_points = quadrature_formula.size();

	FullMatrix<double> local_matrix(dofs_per_cell, dofs_per_cell);
	Vector<double> local_rhs(dofs_per_cell);

	std::vector<unsigned int> local_dof_indices(dofs_per_cell);

	const FEValuesExtractors::Vector position(0);
	const FEValuesExtractors::Scalar L_multiplier(dim);

	int ind;
	int num = dof_handler.n_dofs();

	std::vector<Tensor<2,dim> > grad_phi_x(dofs_per_cell);
	std::vector<double> phi_p(dofs_per_cell);
	std::vector<double> div_phi_x(dofs_per_cell);
	double p;

	Tensor<2,dim> F;
	Tensor<2,dim> F_inverse;
	Tensor<2,dim> F_transpose;
	Tensor<4,dim> prev_step_info;
	Tensor<2,dim> C;
	Tensor<2,dim> C_inverse;
	Tensor<2,dim> dW_dC;
	Tensor<2,dim> intermediate;
	Tensor<4,dim> C_ISO;
	Tensor<4,dim> C_VOL;
	Tensor<4,dim> part_of_second_term;

	Vector<double> J(n_q_points);
	Vector<double> energy(n_q_points);
	double c_J;	//J over the entire cube
	double c_energy;//energy over the entire cube
	std::vector< Point <dim> > pts;
	double I_1;
	double A11;
	double A12;
	double A21;

	Tensor<2,dim> E_first_term;
	Tensor<2,dim> E_second_term;
	Tensor<2,dim> delta_E;
	double rhs_x;
	double rhs_p;

	double tol = pow(10.0,-6.0); //tolerance

	Tensor<2,dim> Identity;	//identity tensor
	for (int i=0; i<dim; ++i)
	{
		for (int j=0; j<dim; ++j)
		{
			if (i == j)
			{
				Identity[i][j]=1;
			}
			else
			{
				Identity[i][j]=0;
			}
		}
	}

	int cnt;	//iteration count
	double norm_of_RHS;

	if (MY_DEBUG)
	{
		MappingQ1<dim> mapping;
		std::vector<Point<dim> > support_points(num);
		DoFTools::map_dofs_to_support_points (mapping, dof_handler, support_points);

		std::cout<<"Support =[";
		for (int i=0; i<n_u-1; i+=3)
		{
			std::cout<<support_points[i]<<";"<<std::endl;
		}
		std::cout<<"];"<<std::endl;
	}

	{
		cnt = 0;
		norm_of_RHS = 100000.0;
		solution = 0;

		while (norm_of_RHS > tol)
		{
			cnt++;
			if (MY_DEBUG)
				std::cout<<"% Iteration "<<cnt<<std::endl;

			typename DoFHandler<dim>::active_cell_iterator
			cell = dof_handler.begin_active(),
			endc = dof_handler.end();

			system_matrix=0;
			system_rhs=0;
			c_J = 0;
			c_energy = 0;

			for (; cell!=endc; ++cell) // for each element
			{
				fe_values.reinit(cell);

				local_matrix = 0;
				local_rhs = 0;
				cell->get_dof_indices(local_dof_indices);

				pts = fe_values.get_quadrature_points();

				for (unsigned int q=0; q<n_q_points; ++q) // for each quadrature point
				{
					F = Identity;	// value of tensor F at point q
									// F = I before deformation
					p = 0;			// value of p at point q

					for (unsigned int k=0; k<dofs_per_cell; ++k) // for each degree of freedom
					{
						grad_phi_x[k] = fe_values[position].gradient(k,q);
						phi_p[k] = fe_values[L_multiplier].value(k,q);
						div_phi_x[k] = fe_values[position].divergence(k,q);

						// find the corresponding value of cumulative solution
						ind = local_dof_indices[k];
						Vector<double> c_sol(num);
						c_sol = c_solution;
						double c_sol_corres_value;
						c_sol_corres_value = c_sol(ind);

						F += c_sol_corres_value * grad_phi_x[k]; // F = I+du/dX
						p += c_sol_corres_value * phi_p[k];
					}

					F_inverse = invert(F);
					F_transpose = transpose(F);
					J[q] = determinant(F);

					if (MY_DEBUG)
						std::cout<<"% q = "<<q<<", "<<pts[q]<<", J = "<<J[q]<<std::endl;
						// value of J at each quadrature point

					C = F_transpose * F;
					C_inverse = invert(C);
					I_1 = trace(C);
					energy[q] = 0.5*mu*(I_1-3)+p*(J[q]-1);

					construct_dW_dC(dW_dC, Identity, C, C_inverse, J[q], mu);
					construct_C_ISO(C_ISO, C, C_inverse,Identity,J[q], mu);	// C_ISO
					construct_C_VOL(C_VOL, C_inverse, p, J[q]);				// C_VOL

					// compute part of A11 that based on information of previous steps
					for (int alpha=0; alpha<dim; ++alpha)
					{
						for (int beta=0; beta<dim; ++beta)
						{
							intermediate[alpha][beta] = J[q]*p*C_inverse[beta][alpha]+2.0*dW_dC[alpha][beta];
						}
					}
					rank2TensorMultNoContraction(part_of_second_term, intermediate, Identity);
					Tensor<4,dim> first;
					Tensor<4,dim> second;
					construct_prev_step_info(prev_step_info,first,second,C_ISO,C_VOL,F,Identity,part_of_second_term);

					for (unsigned int i=0; i<dofs_per_cell; ++i)
					{
						for (unsigned int j=0; j<dofs_per_cell; ++j)
						{
							A11 = construct_A11(grad_phi_x[j], prev_step_info, grad_phi_x[i]);
							A12 = construct_A12_21(phi_p[j], F_inverse, grad_phi_x[i], J[q]);
							A21 = construct_A12_21(phi_p[i], F_inverse, grad_phi_x[j], J[q]);

							local_matrix(i,j) += (A11 + A12 + A21 + phi_p[i]*phi_p[j]) * fe_values.JxW(q);
						}

						//compute delta_E
						contract(E_first_term, grad_phi_x[i], 1, F, 1);
						contract(E_second_term, F, 1, grad_phi_x[i], 1);
						delta_E = 0.5*(E_first_term + E_second_term);

						//compute right-hand-side
						rhs_x = construct_rhs_x(intermediate, delta_E);
						rhs_p = -(J[q]-1) * phi_p[i];
						local_rhs(i) += (rhs_x + rhs_p)*fe_values.JxW(q);
					}
					c_J += J[q]*fe_values.JxW(q);
					c_energy += energy[q]*fe_values.JxW(q);
				}

				if (cnt ==1)
				{
					constraints_1st_iter.distribute_local_to_global (local_matrix, local_rhs, local_dof_indices,
							system_matrix, system_rhs);
				}
				else
				{
					constraints.distribute_local_to_global (local_matrix, local_rhs, local_dof_indices,
							system_matrix, system_rhs);
				}
			}

			A_preconditioner
			= std_cxx1x::shared_ptr<typename InnerPreconditioner<dim>::type>(new typename InnerPreconditioner<dim>::type());
			A_preconditioner->initialize (system_matrix.block(0,0),
					typename InnerPreconditioner<dim>::type::AdditionalData());

			norm_of_RHS = system_rhs.l2_norm();
			if (MY_DEBUG)
			{
				std::cout <<"% norm of RHS = "<< norm_of_RHS <<std::endl;
				std::cout <<"% c_J = "<<c_J<<std::endl;
				std::cout<< "% c_energy = "<<c_energy<<std::endl;
			}

			//start solving
			const InverseMatrix<SparseMatrix<double>,
			typename InnerPreconditioner<dim>::type>
			A_inverse (system_matrix.block(0,0), *A_preconditioner);
			Vector<double> tmp (solution.block(0).size());
			{
				Vector<double> schur_rhs (solution.block(1).size());
				A_inverse.vmult (tmp, system_rhs.block(0));
				system_matrix.block(1,0).vmult (schur_rhs, tmp);
				schur_rhs -= system_rhs.block(1);
				SchurComplement<typename InnerPreconditioner<dim>::type>
				schur_complement (system_matrix, A_inverse);
				SolverControl solver_control (solution.block(1).size(),
						1e-6*schur_rhs.l2_norm());
				SolverCG<> cg (solver_control);
				SparseILU<double> preconditioner;
				preconditioner.initialize (system_matrix.block(1,1),
						SparseILU<double>::AdditionalData());
				InverseMatrix<SparseMatrix<double>,SparseILU<double> >
				m_inverse (system_matrix.block(1,1), preconditioner);
				cg.solve (schur_complement, solution.block(1), schur_rhs,
						m_inverse);
				constraints.distribute (solution);
			}

			system_matrix.block(0,1).vmult (tmp, solution.block(1));
			tmp *= -1;
			tmp += system_rhs.block(0);
			A_inverse.vmult (solution.block(0), tmp);
			if (cnt == 1)
			{
				constraints_1st_iter.distribute (solution);
			}
			else
			{
				constraints.distribute (solution);
			}

			system_matrix.vmult(res_of_LS,solution);
			res_of_LS -= system_rhs;
			c_solution += solution;

			if ((MY_DEBUG)&&(norm_of_RHS <= tol))
			{
				int plot_size;
				plot_size = n_u/3;
				Vector<double> x_dir(plot_size);
				Vector<double> y_dir(plot_size);
				Vector<double> z_dir(plot_size);

				std::cout<<"C_solution = [";
				for (int m=0;m<plot_size;++m)
				{
					x_dir[m]=c_solution.block(0)[m*3];
					y_dir[m]=c_solution.block(0)[m*3+1];
					z_dir[m]=c_solution.block(0)[m*3+2];
					std::cout<<x_dir[m]<<" "<<y_dir[m]<<" "<<z_dir[m]<<";"<<std::endl;
				}
				std::cout<<"];"<<std::endl;
				std::cout<<"Pressure = ["<<c_solution.block(1)<<"];"<<std::endl;
			}
			output_results (cnt);
			output_cumulative_results(cnt);
		}
	}
}

template <int dim>
void
StokesProblem<dim>::output_results (const unsigned int iter) const
{
	std::vector<std::string> solution_names (dim, "position");
	solution_names.push_back ("L_multiplier");

	std::vector<DataComponentInterpretation::DataComponentInterpretation>
	data_component_interpretation
	(dim, DataComponentInterpretation::component_is_part_of_vector);
	data_component_interpretation
	.push_back (DataComponentInterpretation::component_is_scalar);

	DataOut<dim> data_out;
	data_out.attach_dof_handler (dof_handler);
	data_out.add_data_vector (solution, solution_names,
			DataOut<dim>::type_dof_data,
			data_component_interpretation);

	data_out.build_patches ();

	std::ostringstream filename;
	filename << "solution-"
			<< Utilities::int_to_string (iter, 4);

	std::ofstream output (filename.str().c_str());
	data_out.write_vtk (output);
}

template <int dim>
void
StokesProblem<dim>::output_cumulative_results (const unsigned int iter) const
{
	std::vector<std::string> solution_names (dim, "position");
	solution_names.push_back ("L_multiplier");

	std::vector<DataComponentInterpretation::DataComponentInterpretation>
	data_component_interpretation
	(dim, DataComponentInterpretation::component_is_part_of_vector);
	data_component_interpretation
	.push_back (DataComponentInterpretation::component_is_scalar);

	DataOut<dim> data_out;
	data_out.attach_dof_handler (dof_handler);
	data_out.add_data_vector (c_solution, solution_names,
			DataOut<dim>::type_dof_data,
			data_component_interpretation);

	data_out.build_patches ();

	std::ostringstream filename;
	filename << "c_solution-"
			<< Utilities::int_to_string (iter, 4);

	std::ofstream output (filename.str().c_str());
	data_out.write_vtk (output);
}

template <int dim>
void
StokesProblem<dim>::refine_mesh ()
{
	Vector<float> estimated_error_per_cell (triangulation.n_active_cells());

	std::vector<bool> component_mask (dim+1, false);
	component_mask[dim] = true;
	KellyErrorEstimator<dim>::estimate (dof_handler,
			QGauss<dim-1>(degree+1),
			typename FunctionMap<dim>::type(),
			solution,
			estimated_error_per_cell,
			component_mask);

	GridRefinement::refine_and_coarsen_fixed_number (triangulation,
			estimated_error_per_cell,
			0.3, 0.0);
	triangulation.execute_coarsening_and_refinement ();
}

template <int dim>
void StokesProblem<dim>::run ()
{
	std::vector<unsigned int> subdivisions (dim, 1);
	subdivisions[0] = 2; //number of elements in x1 direction
	subdivisions[1] = 2; //number of elements in x2 direction
	subdivisions[2] = 2; //number of elements in x3 direction

	const Point<dim> bottom_left = (dim == 2 ?
			Point<dim>(0,0) :
			Point<dim>(0,0,0));
	const Point<dim> top_right = (dim == 2 ?
			Point<dim>(1,1) :
			Point<dim>(1,1,1));

	GridGenerator::subdivided_hyper_rectangle (triangulation,
			subdivisions,
			bottom_left,
			top_right);

	for (typename Triangulation<dim>::active_cell_iterator
			cell = triangulation.begin_active();
			cell != triangulation.end(); ++cell)
	{
		for (unsigned int f=0; f<GeometryInfo<dim>::faces_per_cell; ++f)
		{
			if (cell->face(f)->center()[2] == 0)	//bottom
			{
				cell->face(f)->set_all_boundary_indicators(1);
			}
			if (cell->face(f)->center()[2] == 1)	//top
			{
				cell->face(f)->set_all_boundary_indicators(1);
			}
		}
	}

	triangulation.refine_global (0);
	setup_dofs ();
	assemble_system ();
}
}

int main ()
{
	try
	{
		using namespace dealii;
		using namespace Shear;
		{
			deallog.depth_console (0);

			StokesProblem<3> flow_problem(1);
			flow_problem.run ();
		}
	}
	catch (std::exception &exc)
	{
		std::cerr << std::endl << std::endl
				<< "----------------------------------------------------"
				<< std::endl;
		std::cerr << "Exception on processing: " << std::endl
				<< exc.what() << std::endl
				<< "Aborting!" << std::endl
				<< "----------------------------------------------------"
				<< std::endl;

		return 1;
	}
	catch (...)
	{
		std::cerr << std::endl << std::endl
				<< "----------------------------------------------------"
				<< std::endl;
		std::cerr << "Unknown exception!" << std::endl
				<< "Aborting!" << std::endl
				<< "----------------------------------------------------"
				<< std::endl;
		return 1;
	}

	return 0;
}





