/*******************************************************************************
* Copyright 2022 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
*    In this example a system of linear equations, Ax = b, is solved by
* preconditioned BiCGSTAB method using forward SOR preconditioner.
*    Application of the SOR preconditioner consists of several iterations of solving the
* system z_{k+1} = M^{-1}*r_{k+1}, where M^{-1} = (sor_omega*L^{-1} + I)(D + sor_omega*U), and a zero
* vector is used as the initial guess on the first iteration.
*
*    PBiCGSTAB formulation step by step:
*       r_0 = b - A * x_0
*       r'_0 = r_0
*       rho_0 = alpha_0 = omega_0 = 1
*       v_0 = p_0 = 0
*
*       while not converged
*       {
*           rho_{k+1} = (r'_0, r_k)
*           beta = (rho_{k+1} / rho_k) * (alpha / omega_k)
*           p_{k+1} = r_k + beta * (p_k - omega_k * v_k)
*           y = M^-1 * p_{k+1}
*           v_{k+1} = A * y
*           alpha = rho_{k+1} / (r'_0, v_{k+1})
*           s = r_k - alpha * v_{k+1}
*           z = M^-1 * s
*           t = A * z
*           omega_{k+1} = (t, s)/(t, t)
*           x_{k+1} = x_k + alpha * y + omega_{k+1} * z
*           r_{k+1} = s - omega_{k+1} * t
*       }
*
*    Sparse matrix - dense vector multiplication and application of SOR are implemented using
* Sparse BLAS Inspector-Executor functionality; inner products and some other vector operations
* are implemented using BLAS Level 1 functionality.
*/

#include "mkl.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>

// printf conversion specifier used in this file when printing MKL_INT values
// (iteration counter and solution indices): "%lld" when MKL_ILP64 is defined,
// "%d" otherwise.
#ifdef MKL_ILP64
#define INT_PRINT_FORMAT "%lld"
#else
#define INT_PRINT_FORMAT "%d"
#endif

// Stopping threshold: the main loop iterates while the ratio of the current
// residual norm to the initial residual norm is greater than this value.
#define tolerance 1.e-7

int main()
{
// ******************************************************************************
//     Declaration and initialization of parameters for sparse representation of
//     the matrix A in the compressed sparse row format:
// ******************************************************************************
#define M 6
#define NNZ 20
    // Sparse representation of the matrix A in CSR format
    MKL_INT csrRowPtr[M + 1] = {0, 3, 7, 10, 13, 17, 20};
    MKL_INT csrColInd[NNZ]   = {0, 1, 3, 0, 1, 2, 4, 1, 2, 5, 0, 3, 4, 1, 3, 4, 5, 2, 4, 5};
    double csrVal[NNZ] = {1.0,   -0.25, -0.25, -0.25, 1.0,   -0.25, -0.25, -0.25, 1.0,   -0.25,
                          -0.25, 1.0,   -0.25, -0.25, -0.25, 1.0,   -0.25, -0.25, -0.25, 1.0};

    // Descriptor of main sparse matrix properties
    struct matrix_descr descr_sym;

    descr_sym.type = SPARSE_MATRIX_TYPE_SYMMETRIC;
    descr_sym.mode = SPARSE_FILL_MODE_FULL;
    descr_sym.diag = SPARSE_DIAG_UNIT;

    // Structure with sparse matrix stored in CSR format
    sparse_matrix_t csrA;

    // Declaration of right hand side and initial solution vector
    double rhs[M] = {2.0, 1.0, 2.0, 2.0, 1.0, 2.0};
    double x[M]   = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6};

    // Declaration of local variables
    double r[M], r0[M], v[M], p[M], y[M], s[M], z[M], t[M];
    double alpha, beta, rho, rho_prev, omega;

    double one = 1.0, zero = 0.0, norm_of_residual = 1.0, initial_norm_of_residual = 1.0;
    MKL_INT i, k;
    MKL_INT expected_calls        = 10;
    MKL_INT initial_guess_is_zero = 1;

    // SOR preconditioner parameters (sor_omega and number of iterations per single
    // preconditioner application)
    double sor_omega = 0.15;
    MKL_INT loc_iter, num_loc_iters = 3;

    printf("   In this example preconditioned BiCGSTAB method with SOR preconditioner\n");
    printf("   solves following system of sparse linear system:\n");
    printf("\n");
    printf("      | 1.0 , -0.25,  0.0 , -0.25,  0.0 ,  0.0  |   |x_1|   |2.0|\n");
    printf("      |-0.25,  1.0 , -0.25,  0.0 , -0.25,  0.0  |   |x_2|   |1.0|\n");
    printf("      | 0.0 , -0.25,  1.0 ,  0.0 ,  0.0 , -0.25 | * |x_3| = |2.0|\n");
    printf("      |-0.25,  0.0 ,  0.0 ,  1.0 , -0.25,  0.0  |   |x_4|   |2.0|\n");
    printf("      | 0.0 , -0.25,  0.0 , -0.25,  1.0 , -0.25 |   |x_5|   |1.0|\n");
    printf("      | 0.0 ,  0.0 , -0.25,  0.0 , -0.25,  1.0  |   |x_6|   |2.0|\n");
    printf("\n");
    printf("   SOR preconditioner parameters are:\n");
    printf("   omega = %e, number of local iterations is %d\n", sor_omega, (int)num_loc_iters);
    printf("\n");
    printf("   Process stops after reducing norm of correction to %e\n", tolerance);
    printf("\n");

    // Create handle with matrix stored in CSR format
    mkl_sparse_d_create_csr(&csrA, SPARSE_INDEX_BASE_ZERO, M, M, csrRowPtr, csrRowPtr + 1,
                            csrColInd, csrVal);

    mkl_sparse_set_mv_hint(csrA, SPARSE_OPERATION_NON_TRANSPOSE, descr_sym, expected_calls);
    mkl_sparse_set_sorv_hint(SPARSE_SOR_FORWARD, csrA, descr_sym, expected_calls);

    // Analyze sparse matrix and choose proper kernels and workload balancing
    // strategy
    mkl_sparse_optimize(csrA);

    // r = b - A * x
    if (initial_guess_is_zero) {
        cblas_dscal(M, zero, x, 1);
        cblas_dcopy(M, rhs, 1, r, 1);
    }
    else {
        cblas_dcopy(M, rhs, 1, r, 1);
        mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, -one, csrA, descr_sym, x, one, r);
    }

    // r0 = r
    cblas_dcopy(M, r, 1, r0, 1);

    // rho_0 = alpha_0 = omega_0 = 1
    rho = alpha = omega = one;

    // v = p = 0
    cblas_dscal(M, zero, v, 1);
    cblas_dscal(M, zero, p, 1);

    // Initial norm of residual
    initial_norm_of_residual = cblas_dnrm2(M, r, 1);
    norm_of_residual         = initial_norm_of_residual;

    // Start of main PCG algorithm
    k = 0;

    while (norm_of_residual / initial_norm_of_residual > tolerance && k < 1000) {

        // rho_{k+1} = (r'_0, r_k)
        rho_prev = rho;
        rho      = cblas_ddot(M, r0, 1, r, 1);

        // beta = (rho_{k+1} / rho_k)/(alpha / omega_k)
        beta = (rho / rho_prev) / (alpha / omega);

        // p_{k+1} = r_k + beta * (p_k - omega_k * v_k)
        for (i = 0; i < M; i++) {
            p[i] = r[i] + beta * (p[i] - omega * v[i]);
        }

        // y = M^-1 * p_{k+1}
        for (loc_iter = 0; loc_iter < num_loc_iters; loc_iter++) {
            if (loc_iter == 0)
                mkl_sparse_d_sorv(SPARSE_SOR_FORWARD, descr_sym, csrA, sor_omega, zero, y, p);
            else
                mkl_sparse_d_sorv(SPARSE_SOR_FORWARD, descr_sym, csrA, sor_omega, one, y, p);
        }

        // v_{k+1} = A * y
        mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, one, csrA, descr_sym, y, zero, v);

        // alpha = rho_{k+1} / (r'_0, v_{k+1})
        alpha = rho / cblas_ddot(M, r0, 1, v, 1);

        // s = r_k - alpha * v_{k+1}
        for (i = 0; i < M; i++) {
            s[i] = r[i] - alpha * v[i];
        }

        // z = M^-1 * s
        for (loc_iter = 0; loc_iter < num_loc_iters; loc_iter++) {
            if (loc_iter == 0)
                mkl_sparse_d_sorv(SPARSE_SOR_FORWARD, descr_sym, csrA, sor_omega, zero, z, s);
            else
                mkl_sparse_d_sorv(SPARSE_SOR_FORWARD, descr_sym, csrA, sor_omega, one, z, s);
        }

        // t = A * z
        mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, one, csrA, descr_sym, z, zero, t);

        // omega_{k+1} = (t, s)/(t, t)
        omega = cblas_ddot(M, t, 1, s, 1) / cblas_ddot(M, t, 1, t, 1);

        // x_{k+1} = x_k + alpha * y + omega_{k+1} * z
        for (i = 0; i < M; i++) {
            x[i] += alpha * y[i] + omega * z[i];
        }

        // r_{k+1} = s - omega_{k+1} * t
        for (i = 0; i < M; i++) {
            r[i] = s[i] - omega * t[i];
        }

        // Calculate current norm of residual
        norm_of_residual = cblas_dnrm2(M, r, 1);
        printf("relative norm of residual on " INT_PRINT_FORMAT " iteration = %4.5e\n", ++k,
               norm_of_residual / initial_norm_of_residual);
    }

    printf("\n");
    printf("Preconditioned BiCGSTAB process successfully converge and following "
           "solution have been "
           "obtained using " INT_PRINT_FORMAT " iterations, relative norm of residual is %4.5e\n",
           k, norm_of_residual / initial_norm_of_residual);
    for (i = 0; i < M; i++)
        printf("x_" INT_PRINT_FORMAT " = %4.5f\n", i + 1, x[i]);

    // Release matrix handle and deallocate matrix
    mkl_sparse_destroy(csrA);

    return 0;
}
