#!/bin/bash
#===============================================================================
# Copyright 2024 Intel Corporation.
#
# This software and the related documents are Intel copyrighted  materials,  and
# your use of  them is  governed by the  express license  under which  they were
# provided to you (License).  Unless the License provides otherwise, you may not
# use, modify, copy, publish, distribute,  disclose or transmit this software or
# the related documents without Intel's prior written permission.
#
# This software and the related documents  are provided as  is,  with no express
# or implied  warranties,  other  than those  that are  expressly stated  in the
# License.
#===============================================================================

#
# Configure and compile the HPCG binaries from the current directory.
#
# Globals:  MKLROOT (read) - forwarded to make as MKLROOT=<value>
# Outputs:  progress banners and the commands being run on stdout;
#           a diagnostic on stderr when a step fails
# Returns:  0 when both steps succeed, otherwise the exit status of the
#           step that failed (make is not attempted after a failed configure)
#
function build_hpcg {
    local status

    echo '========= Start Build ==============='

    echo "./configure IMPI_PVC"
    ./configure IMPI_PVC
    status=$?
    if (( status != 0 )); then
        echo "build_hpcg: './configure IMPI_PVC' failed with status ${status}" >&2
        return "${status}"
    fi

    # The whole MKLROOT=... assignment is quoted so that an install prefix
    # containing whitespace still reaches make as a single argument.
    echo "make -j8 MKLROOT=${MKLROOT}"
    make -j8 "MKLROOT=${MKLROOT}"
    status=$?
    if (( status != 0 )); then
        echo "build_hpcg: make failed with status ${status}" >&2
        return "${status}"
    fi

    #
    # BASIC_PROFILING adds some event waits and overhead to collect wall times
    # for different segments of the CG algorithm, which are written to several
    # .json files upon completion.  It should not be used for official runs,
    # but may be helpful for profiling and debugging.
    #

    #echo "make -j8 MKLROOT=${MKLROOT} BASIC_PROFILING=yes"
    #make -j8 "MKLROOT=${MKLROOT}" BASIC_PROFILING=yes

    echo '========= Finished Build ============'
}

function run_hpcg {


    # setup runtime configuration
    #
    export SYCL_DEVICE_ALLOWLIST=
    #export ZE_AFFINITY_MASK=1
    #export ONEAPI_DEVICE_SELECTOR="opencl:gpu"
    export ONEAPI_DEVICE_SELECTOR="level_zero:gpu"
    #unset ONEAPI_DEVICE_SELECTOR

    echo '========= Driver versions ==========='
    sycl-ls

    echo '========= MPI version ==============='
    mpiexec.hydra --version

    echo '========= ICX SYCL version =========='
    icpx --version

    echo '========= oneMKL PATH ==============='
    echo "MKLROOT = ${MKLROOT}"

    echo '========= Other Settings ============'
    echo "export SYCL_QUEUE_THREAD_POOL_SIZE=${SYCL_QUEUE_THREAD_POOL_SIZE}"

    echo '========= Start Run ================='

    #nprob=16
    #nprob=128
    #nprob=256
    nprob=320
    nx=${nprob}
    ny=${nprob}
    nz=${nprob}
    #nx=512
    #ny=512
    #nz=256

    exe=bin/xhpcg_impi_pvc

    run_time_in_seconds=60  # official submitable runs need this to be >= 1800

    #runRealRef=0 # good for faster completion of runs with only 50 iters (no comparison to reference CG)
    runRealRef=1 # official runs need to use this (may require >= 50 iters to match
                 # reference CG convergence after 50 iters, but will only count as 50)
    #runRealRef=2 # For small ranks (1,2,4 with size 320x320x320 or 1,2,4,8,12 with
                  # size 512x512x256), offers a shorter path to measuring reference
                  # code convergence after 50 iters of CG_ref to match with optimized
                  # CG (may require >= 50 iters, but will only count as 50)

    #nodefile=${SLURM_SUBMIT_DIR}/nodefile.${SLURM_JOBID}

    nprocs_per_node=2 # 1 card per node filled
    #nprocs_per_node=4 # 2 cards per node filled
    #nprocs_per_node=8 # 4 cards per node x4 OAM filled 
    #nprocs_per_node=12 # 6 cards per node x6 OAM Borealis filled 
    #nprocs_per_node=16 # 8 cards per node x8 OAM filled
    
    #nnodes=$SLURM_JOB_NUM_NODES
    nnodes=1
    
    nprocs=$(( nnodes*nprocs_per_node ))
    #nprocs=2  # or set it manually
    
    nthreads_per_rank=28 # for reference code openmp threads

    affinity=compact      # on each node, assign ranks to gpu tiles compactly filling card after card
    #affinity=round_robin # on each node, assign ranks to gpu tiles  round_robin, placing one in
                          # each card, then going back to fill second tile in each card

    #
    # Configure OpenMP on node, if relevant
    #
    export OMP_NUM_THREADS=${nthreads_per_rank}
    export OMP_PROC_BIND=close
    export OMP_PLACES=threads

    #
    # Intel(R) MPI Library flags
    #
    export I_MPI_FABRICS=shm:ofi
    export I_MPI_PIN_DOMAIN=auto
    export I_MPI_PIN_ORDER=bunch
    export I_MPI_DEBUG=5  # print out runtime mapping of mpi ranks to hardware and threads


    #
    # runtime parameter summary
    #

    echo " ======== ${exe} ========="
    #echo " ===       nodefile: ${nodefile}"
    echo " ===         nnodes: ${nnodes}"
    echo " ===            ppn: ${nprocs_per_node}"
    echo " ===         nprocs: ${nprocs}"
    echo " ===       nthreads: ${nthreads_per_rank}"
    echo " ===  affinity/node: ${affinity}"
    echo " ===   run-real-ref: ${runRealRef}"
    echo " ===      prob_size: ${nx} x ${ny} x ${nz}"
    echo " ===       run_time: ${run_time_in_seconds}"


    # with nodefile
    #echo "mpiexec.hydra --genvall -np ${nprocs} --ppn ${nprocs_per_node} -f ${nodefile}  ${exe} --nx=${nx} --ny=${ny} --nz=${nz} -t${run_time_in_seconds} --run-real-ref=${runRealRef} --affinity-per-node=${affinity}"
    #mpiexec.hydra --genvall -np ${nprocs} --ppn ${nprocs_per_node} -f ${nodefile} ${exe}  --nx=${nx} --ny=${ny} --nz=${nz} -t${run_time_in_seconds} --run-real-ref=${runRealRef} --affinity-per-node=${affinity}

    # without nodefile
    echo "mpiexec.hydra --genvall -np ${nprocs} --ppn ${nprocs_per_node} ${exe} --nx=${nx} --ny=${ny} --nz=${nz} -t${run_time_in_seconds} --run-real-ref=${runRealRef} --affinity-per-node=${affinity}"
    mpiexec.hydra --genvall -np ${nprocs} --ppn ${nprocs_per_node} ${exe}  --nx=${nx} --ny=${ny} --nz=${nz} -t${run_time_in_seconds} --run-real-ref=${runRealRef} --affinity-per-node=${affinity}

    echo '========= Finished ================='
}



#
# Set up the environment with the required dependencies, then build and run.
# (The versions below are the minimal dependencies; updating to the latest
# released versions is recommended.)
#

#
# Select Intel(R) oneAPI Math Kernel Library (oneMKL) version
#
source /opt/intel/oneapi/mkl/2024.0/env/vars.sh intel64
# Note: MKLROOT must be defined, and LD_LIBRARY_PATH must contain the path to
# the oneMKL shared libraries
# (export LD_LIBRARY_PATH=${MKLROOT}/lib:${LD_LIBRARY_PATH}).

#
# Select Intel(R) oneAPI DPC++ Compiler (should be compatible with oneMKL version)
#
source /opt/intel/oneapi/compiler/2024.0/env/vars.sh intel64

#
# Select Intel(R) MPI Library version
#
source /opt/intel/oneapi/mpi/2021.11.0/env/vars.sh intel64

#
# Select Intel(R) oneAPI DPC++ Library (oneDPL) version
#
source /opt/intel/oneapi/dpl/2022.3/env/vars.sh intel64

# Surface the most common setup mistake early; the build passes MKLROOT to make.
if [[ -z "${MKLROOT:-}" ]]; then
    echo "WARNING: MKLROOT is not set; check the oneMKL vars.sh path above" >&2
fi

#
# build hpcg binaries; do not attempt a run if the build reports failure
#
build_hpcg || exit $?

#
# run hpcg binaries (its status becomes the exit status of this script)
#
run_hpcg
