/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

/*--------------------------------------------------------------------------
 * Test driver for unstructured matrix interface (IJ).
 *
 * It tests the assembly phase of an IJ matrix on both CPU and GPU.
 *--------------------------------------------------------------------------*/

#include "HYPRE.h"
#include "HYPRE_utilities.h"
#include "_hypre_IJ_mv.h"
#include "_hypre_parcsr_mv.h"
#include "HYPRE_parcsr_ls.h"
#include "_hypre_utilities.h"
//#include "_hypre_utilities.hpp"

/* Generates the reference Laplacian (7-, 9- or 27-point stencil), perturbs its
 * upper-triangular entries and returns, through the pointer arguments, the
 * local index ranges, the extracted host arrays (entries per row, row indices
 * per row, row indices per entry, column indices, coefficients) and the
 * reference matrix itself. */
HYPRE_Int buildMatrixEntries(MPI_Comm comm,
                             HYPRE_Int nx, HYPRE_Int ny, HYPRE_Int nz,
                             HYPRE_Int Px, HYPRE_Int Py, HYPRE_Int Pz,
                             HYPRE_Real cx, HYPRE_Real cy, HYPRE_Real cz, HYPRE_Int base,
                             HYPRE_BigInt *ilower, HYPRE_BigInt *iupper,
                             HYPRE_BigInt *jlower, HYPRE_BigInt *jupper,
                             HYPRE_Int *nrows, HYPRE_BigInt *num_nonzeros,
                             HYPRE_Int **nnzrow_ptr, HYPRE_BigInt **rows_ptr,
                             HYPRE_BigInt **rows_coo_ptr, HYPRE_BigInt **cols_ptr,
                             HYPRE_Real **coefs_ptr, HYPRE_Int stencil, HYPRE_ParCSRMatrix *parcsr_ptr);

/* Copies the local rows of a ParCSR matrix into newly allocated host arrays,
 * shifting every global index by `base`. */
HYPRE_Int getParCSRMatrixData(HYPRE_ParCSRMatrix  A, HYPRE_Int base, HYPRE_Int *nrows_ptr,
                              HYPRE_BigInt *num_nonzeros_ptr,
                              HYPRE_Int **nnzrow_ptr, HYPRE_BigInt **rows_ptr, HYPRE_BigInt **rows_coo_ptr,
                              HYPRE_BigInt **cols_ptr, HYPRE_Real **coefs_ptr);

/* Compares the matrix held by ij_A against parcsr_ref using Frobenius norms
 * and returns the resulting error measure. */
HYPRE_Real checkMatrix(HYPRE_ParCSRMatrix parcsr_ref, HYPRE_IJMatrix ij_A);

/* Creates an IJ matrix and runs the commands encoded in cmd_sequence
 * ('s' = set, 'a' = add, 'g' = get, 'A' = assemble) on it, feeding the rows
 * in chunks; the matrix is returned through ij_A_ptr. */
HYPRE_Int test_all(MPI_Comm comm, const char *test_name, HYPRE_MemoryLocation memory_location,
                   HYPRE_Int option, const char *cmd_sequence, HYPRE_BigInt ilower, HYPRE_BigInt iupper,
                   HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, HYPRE_BigInt num_nonzeros,
                   HYPRE_Int nchunks, HYPRE_Int init_alloc, HYPRE_Int early_assemble, HYPRE_Real grow_factor,
                   HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs,
                   HYPRE_IJMatrix *ij_A_ptr);

hypre_int
main( hypre_int  argc,
      char      *argv[] )
{
   MPI_Comm                  comm = hypre_MPI_COMM_WORLD;
   HYPRE_Int                 num_procs;
   HYPRE_Int                 myid;
   HYPRE_Int                 arg_index;
   HYPRE_Int                 time_index;
   HYPRE_Int                 print_usage;
   char                      memory_location_name[8];
   HYPRE_Int                 nrows;
   HYPRE_BigInt              num_nonzeros;
   HYPRE_BigInt              ilower, iupper;
   HYPRE_BigInt              jlower, jupper;
   HYPRE_Int                *nnzrow = NULL, *h_nnzrow, *d_nnzrow = NULL;
   HYPRE_BigInt             *rows = NULL,   *h_rows,   *d_rows = NULL;
   HYPRE_BigInt             *rows_coo = NULL,  *h_rows_coo,  *d_rows_coo = NULL;
   HYPRE_BigInt             *cols = NULL,   *h_cols,   *d_cols = NULL;
   HYPRE_Real               *coefs = NULL,  *h_coefs,  *d_coefs = NULL;
   HYPRE_IJMatrix            ij_A, ij_B, ij_AT;
   HYPRE_ParCSRMatrix        parcsr_ref = NULL, parcsr_trans = NULL, parcsr_B = NULL;
   void                     *obj_B;

   /* Driver input parameters */
   HYPRE_Int                 Px, Py, Pz;
   HYPRE_Int                 nx, ny, nz;
   HYPRE_Real                cx, cy, cz;
   HYPRE_Int                 nchunks;
   HYPRE_Int                 mode, ierr = 0;
   HYPRE_Real                tol = HYPRE_REAL_EPSILON;
   HYPRE_Int                 option, base;
   HYPRE_Int                 stencil;
   HYPRE_Int                 print_matrix;
   HYPRE_Int                 init_alloc = -1;
   HYPRE_Int                 early_assemble = 0;
   HYPRE_Real                grow_factor = -1.0;

   /* default execution policy and memory space */
#if defined(HYPRE_TEST_USING_HOST)
   HYPRE_MemoryLocation memory_location = HYPRE_MEMORY_HOST;
   HYPRE_ExecutionPolicy default_exec_policy = HYPRE_EXEC_HOST;
#else
   HYPRE_MemoryLocation memory_location = HYPRE_MEMORY_DEVICE;
   HYPRE_ExecutionPolicy default_exec_policy = HYPRE_EXEC_DEVICE;
#endif

   /* Initialize MPI */
   hypre_MPI_Init(&argc, &argv);
   hypre_MPI_Comm_size(comm, &num_procs );
   hypre_MPI_Comm_rank(comm, &myid );

   /* Check memory location and exec policy */
   for (arg_index = 1; arg_index < argc; arg_index ++)
   {
      if ( strcmp(argv[arg_index], "-memory_host") == 0 )
      {
         memory_location = HYPRE_MEMORY_HOST;
      }
      else if ( strcmp(argv[arg_index], "-memory_device") == 0 )
      {
         memory_location = HYPRE_MEMORY_DEVICE;
      }
      else if ( strcmp(argv[arg_index], "-exec_host") == 0 )
      {
         default_exec_policy = HYPRE_EXEC_HOST;
      }
      else if ( strcmp(argv[arg_index], "-exec_device") == 0 )
      {
         default_exec_policy = HYPRE_EXEC_DEVICE;
      }
   }

   /*-----------------------------------------------------------------
    * GPU Device binding
    * Must be done before HYPRE_Initialize() and should not be changed after
    *-----------------------------------------------------------------*/
   if (default_exec_policy == HYPRE_EXEC_DEVICE)
   {
      hypre_bind_device_id(-1, myid, num_procs, hypre_MPI_COMM_WORLD);
   }

   /* Initialize Hypre: must be the first Hypre function to call */
   time_index = hypre_InitializeTiming("Hypre init");
   hypre_BeginTiming(time_index);

   HYPRE_Initialize();
   HYPRE_SetMemoryLocation(memory_location);
   HYPRE_SetExecutionPolicy(default_exec_policy);
   if (default_exec_policy == HYPRE_EXEC_DEVICE)
   {
      HYPRE_DeviceInitialize();
   }

   hypre_EndTiming(time_index);
   hypre_PrintTiming("Hypre init times", hypre_MPI_COMM_WORLD);
   hypre_FinalizeTiming(time_index);
   hypre_ClearTiming();

   /*-----------------------------------------------------------
    * Set default parameters
    *-----------------------------------------------------------*/
   Px = num_procs;
   Py = 1;
   Pz = 1;

   nx = 50;
   ny = 51;
   nz = 52;

   cx = 1.0;
   cy = 2.0;
   cz = 3.0;

   mode    = (1 << 7) - 1;
   option  = 1;
   nchunks = 3;
   base    = 0;
   stencil = 7;

   print_matrix = 0;

   /*-----------------------------------------------------------
    * Parse command line
    *-----------------------------------------------------------*/
   print_usage = 0;
   arg_index = 1;
   while ( (arg_index < argc) && (!print_usage) )
   {
      if ( strcmp(argv[arg_index], "-P") == 0 )
      {
         arg_index++;
         Px = atoi(argv[arg_index++]);
         Py = atoi(argv[arg_index++]);
         Pz = atoi(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-n") == 0 )
      {
         arg_index++;
         nx  = atoi(argv[arg_index++]);
         ny  = atoi(argv[arg_index++]);
         nz  = atoi(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-c") == 0 )
      {
         arg_index++;
         cx = (HYPRE_Real)atof(argv[arg_index++]);
         cy = (HYPRE_Real)atof(argv[arg_index++]);
         cz = (HYPRE_Real)atof(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-mode") == 0 )
      {
         arg_index++;
         mode = atoi(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-option") == 0 )
      {
         arg_index++;
         option = atoi(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-9pt") == 0 )
      {
         arg_index++;
         stencil = 9;
      }
      else if ( strcmp(argv[arg_index], "-27pt") == 0 )
      {
         arg_index++;
         stencil = 27;
      }
      else if ( strcmp(argv[arg_index], "-nchunks") == 0 )
      {
         arg_index++;
         nchunks = atoi(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-base") == 0 )
      {
         arg_index++;
         base = atoi(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-init") == 0 )
      {
         arg_index++;
         init_alloc = atoi(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-early") == 0 )
      {
         arg_index++;
         early_assemble = atoi(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-grow") == 0 )
      {
         arg_index++;
         grow_factor = (HYPRE_Real) atof(argv[arg_index++]);
      }
      else if ( strcmp(argv[arg_index], "-print") == 0 )
      {
         arg_index++;
         print_matrix = 1;
      }
      else
      {
         print_usage = 1; break;
      }
   }

   /*-----------------------------------------------------------
    * Safety checks
    *-----------------------------------------------------------*/
   if (Px * Py * Pz != num_procs)
   {
      hypre_printf("Px x Py x Pz is different than the number of MPI processes");
      return (-1);
   }

   /*-----------------------------------------------------------
    * Print usage info
    *-----------------------------------------------------------*/
   if ( print_usage )
   {
      if ( myid == 0 )
      {
         hypre_printf("\n");
         hypre_printf("Usage: %s [<options>]\n", argv[0]);
         hypre_printf("\n");
         hypre_printf("      -n <nx> <ny> <nz>      : total problem size \n");
         hypre_printf("      -P <Px> <Py> <Pz>      : processor topology\n");
         hypre_printf("      -c <cx> <cy> <cz>      : diffusion coefficients\n");
         hypre_printf("      -memory_location <val> : memory location of the assembled matrix\n");
         hypre_printf("             0 = HOST\n");
         hypre_printf("             1 = DEVICE (default)\n");
         hypre_printf("      -nchunks <val>         : number of chunks passed to Set/AddValues\n");
         hypre_printf("      -base <val>            : matrix index base\n");
         hypre_printf("      -mode <val>            : tests to be performed (code)\n");
         hypre_printf("             1 = Set (Default)      (sA)\n");
         hypre_printf("             2 = SetOffProc         (aaaaaA)\n");
         hypre_printf("             4 = SetSet             (ssA)\n");
         hypre_printf("             8 = AddSet             (asA)\n");
         hypre_printf("            16 = SetAdd             (saA)\n");
         hypre_printf("            32 = SetAddAssembleSet  (saAsA)\n");
         hypre_printf("            64 = AddAddAddAddAddSet (aaaaasA)\n");
         hypre_printf("           128 = SetAssembleGet     (sAg)\n");
         hypre_printf("      -option <val>          : interface option of Set/AddToValues\n");
         hypre_printf("             1 = CSR-like (default)\n");
         hypre_printf("             2 = COO-like\n");
         hypre_printf("      -print                 : print matrices\n");
         hypre_printf("\n");
      }

      return (0);
   }

#if defined(HYPRE_USING_MEMORY_TRACKER)
   hypre_MemoryTrackerSetPrint(1);
#endif

   /*-----------------------------------------------------------
    * Print driver parameters
    *-----------------------------------------------------------*/
   switch (memory_location)
   {
      case HYPRE_MEMORY_UNDEFINED:
         return -1;

      case HYPRE_MEMORY_DEVICE:
         hypre_sprintf(memory_location_name, "Device"); break;

      case HYPRE_MEMORY_HOST:
         hypre_sprintf(memory_location_name, "Host"); break;
   }

   if (myid == 0)
   {
      hypre_printf("  Memory location: %s\n", memory_location_name);
      hypre_printf("    (nx, ny, nz) = (%b, %b, %b)\n", nx, ny, nz);
      hypre_printf("    (Px, Py, Pz) = (%d, %d, %d)\n", Px, Py, Pz);
      hypre_printf("    (cx, cy, cz) = (%f, %f, %f)\n", cx, cy, cz);
      hypre_printf("\n");
   }

   /* default memory location */
   HYPRE_SetMemoryLocation(memory_location);

   /* default execution policy */
   HYPRE_SetExecutionPolicy(default_exec_policy);

#if defined(HYPRE_USING_OPENMP)
   if (hypre_GetExecPolicy1(memory_location) == HYPRE_EXEC_HOST)
   {
      mode = mode & ~2; /* skip AddTranspose with OMP */
   }
#endif

   /*-----------------------------------------------------------
    * Build matrix entries
    *-----------------------------------------------------------*/
   buildMatrixEntries(comm, nx, ny, nz, Px, Py, Pz, cx, cy, cz, base,
                      &ilower, &iupper, &jlower, &jupper, &nrows, &num_nonzeros,
                      &h_nnzrow, &h_rows, &h_rows_coo, &h_cols, &h_coefs, stencil,
                      &parcsr_ref);

   switch (memory_location)
   {
      case HYPRE_MEMORY_DEVICE:
         d_nnzrow = hypre_TAlloc(HYPRE_Int,    nrows,        HYPRE_MEMORY_DEVICE);
         d_rows   = hypre_TAlloc(HYPRE_BigInt, nrows,        HYPRE_MEMORY_DEVICE);
         d_rows_coo  = hypre_TAlloc(HYPRE_BigInt, num_nonzeros, HYPRE_MEMORY_DEVICE);
         d_cols   = hypre_TAlloc(HYPRE_BigInt, num_nonzeros, HYPRE_MEMORY_DEVICE);
         d_coefs  = hypre_TAlloc(HYPRE_Real,   num_nonzeros, HYPRE_MEMORY_DEVICE);

         hypre_TMemcpy(d_nnzrow, h_nnzrow, HYPRE_Int, nrows,
                       HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
         hypre_TMemcpy(d_rows, h_rows, HYPRE_BigInt, nrows,
                       HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
         hypre_TMemcpy(d_rows_coo, h_rows_coo, HYPRE_BigInt, num_nonzeros,
                       HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
         hypre_TMemcpy(d_cols, h_cols, HYPRE_BigInt, num_nonzeros,
                       HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
         hypre_TMemcpy(d_coefs, h_coefs, HYPRE_Real, num_nonzeros,
                       HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);

         nnzrow = d_nnzrow;
         rows   = d_rows;
         rows_coo  = d_rows_coo;
         cols   = d_cols;
         coefs  = d_coefs;
         break;

      case HYPRE_MEMORY_HOST:
         nnzrow = h_nnzrow;
         rows   = h_rows;
         rows_coo  = h_rows_coo;
         cols   = h_cols;
         coefs  = h_coefs;
         break;

      case HYPRE_MEMORY_UNDEFINED:
         return -1;
   }

   /*-----------------------------------------------------------
    * Test different Set/Add combinations
    *-----------------------------------------------------------*/
   /* Test Set */
   if (mode & 1)
   {
      test_all(comm, "set", memory_location, option, "sA",
               ilower, iupper, jlower, jupper, nrows, num_nonzeros,
               nchunks, init_alloc, early_assemble, grow_factor,
               h_nnzrow, nnzrow, option == 1 ? rows : rows_coo,
               cols, coefs, &ij_A);

      ierr += checkMatrix(parcsr_ref, ij_A) > tol;
      if (print_matrix)
      {
         HYPRE_IJMatrixPrint(ij_A, "ij_Set");
      }
      HYPRE_IJMatrixDestroy(ij_A);
   }

   /* Test AddTranspose
    * set values with (row, col) reversed, i.e., the transpose of A
    * in this way, we can test off-proc add to values
    */
   if (mode & 2)
   {
      test_all(comm, "addtrans", memory_location, 2, "aaaaaA",
               ilower, iupper, jlower, jupper, nrows, num_nonzeros,
               nchunks, init_alloc, early_assemble, grow_factor,
               h_nnzrow, nnzrow, cols, rows_coo, coefs, &ij_AT);

      hypre_ParCSRMatrixTranspose(parcsr_ref, &parcsr_trans, 1);
      hypre_ParCSRMatrixScale(parcsr_trans, 5.0);

      ierr += checkMatrix(parcsr_trans, ij_AT) > tol;
      if (print_matrix)
      {
         HYPRE_IJMatrixPrint(ij_AT, "ij_AddTrans");
      }
      HYPRE_IJMatrixDestroy(ij_AT);
      HYPRE_ParCSRMatrixDestroy(parcsr_trans);
   }

   /* Test Set/Set */
   if (mode & 4)
   {
      test_all(comm, "set/set", memory_location, option, "ssA",
               ilower, iupper, jlower, jupper, nrows, num_nonzeros,
               nchunks, init_alloc, early_assemble, grow_factor,
               h_nnzrow, nnzrow, option == 1 ? rows : rows_coo,
               cols, coefs, &ij_A);

      ierr += checkMatrix(parcsr_ref, ij_A) > tol;
      if (print_matrix)
      {
         HYPRE_IJMatrixPrint(ij_A, "ij_SetSet");
      }
      HYPRE_IJMatrixDestroy(ij_A);
   }

   /* Test Add/Set */
   if (mode & 8)
   {
      test_all(comm, "add/set", memory_location, option, "asA",
               ilower, iupper, jlower, jupper, nrows, num_nonzeros,
               nchunks, init_alloc, early_assemble, grow_factor,
               h_nnzrow, nnzrow, option == 1 ? rows : rows_coo,
               cols, coefs, &ij_A);

      ierr += checkMatrix(parcsr_ref, ij_A) > tol;
      if (print_matrix)
      {
         HYPRE_IJMatrixPrint(ij_A, "ij_AddSet");
      }
      HYPRE_IJMatrixDestroy(ij_A);
   }

   /* Test Set/Add */
   if (mode & 16)
   {
      test_all(comm, "set/add", memory_location, option, "saA",
               ilower, iupper, jlower, jupper, nrows, num_nonzeros,
               nchunks, init_alloc, early_assemble, grow_factor,
               h_nnzrow, nnzrow, option == 1 ? rows : rows_coo,
               cols, coefs, &ij_A);

      hypre_ParCSRMatrix *parcsr_ref2 = hypre_ParCSRMatrixClone(parcsr_ref, 1);
      hypre_ParCSRMatrixScale(parcsr_ref2, 2.0);

      ierr += checkMatrix(parcsr_ref2, ij_A) > tol;
      if (print_matrix)
      {
         HYPRE_IJMatrixPrint(ij_A, "ij_SetAdd");
      }
      HYPRE_IJMatrixDestroy(ij_A);
      HYPRE_ParCSRMatrixDestroy(parcsr_ref2);
   }

   /* Test Set/Add/Assemble/Set */
   if (mode & 32)
   {
      test_all(comm, "set/add/assemble/set", memory_location, option, "saAsA",
               ilower, iupper, jlower, jupper, nrows, num_nonzeros,
               nchunks, init_alloc, early_assemble, grow_factor,
               h_nnzrow, nnzrow, option == 1 ? rows : rows_coo,
               cols, coefs, &ij_A);

      ierr += checkMatrix(parcsr_ref, ij_A) > tol;
      if (print_matrix)
      {
         HYPRE_IJMatrixPrint(ij_A, "ij_SetAddAssembleSet");
      }
      HYPRE_IJMatrixDestroy(ij_A);
   }

   /* Test Adds */
   if (mode & 64)
   {
      test_all(comm, "5adds/set", memory_location, option, "aaaaasA",
               ilower, iupper, jlower, jupper, nrows, num_nonzeros,
               nchunks, init_alloc, early_assemble, grow_factor,
               h_nnzrow, nnzrow, option == 1 ? rows : rows_coo,
               cols, coefs, &ij_A);

      hypre_ParCSRMatrix *parcsr_ref2 = hypre_ParCSRMatrixClone(parcsr_ref, 1);
      hypre_ParCSRMatrixScale(parcsr_ref2, 1.);

      ierr += checkMatrix(parcsr_ref2, ij_A) > tol;
      if (print_matrix)
      {
         HYPRE_IJMatrixPrint(ij_A, "ij_5AddsSet");
      }
      HYPRE_IJMatrixDestroy(ij_A);
      HYPRE_ParCSRMatrixDestroy(parcsr_ref2);
   }

   /* Test Get */
   if (mode & 128)
   {
      test_all(comm, "set/get", memory_location, option, "sAg",
               ilower, iupper, jlower, jupper, nrows, num_nonzeros,
               nchunks, init_alloc, early_assemble, grow_factor,
               h_nnzrow, nnzrow, option == 1 ? rows : rows_coo,
               cols, coefs, &ij_A);

      /* Create matrix with (rows, cols, coefs) gotten from ij_A */
      HYPRE_IJMatrixCreate(comm, ilower, iupper, jlower, jupper, &ij_B);
      HYPRE_IJMatrixSetObjectType(ij_B, HYPRE_PARCSR);
      HYPRE_IJMatrixInitialize_v2(ij_B, memory_location);
      HYPRE_IJMatrixSetValues(ij_B, nrows, nnzrow, rows, cols, coefs);
      HYPRE_IJMatrixAssemble(ij_B);
      HYPRE_IJMatrixMigrate(ij_B, HYPRE_MEMORY_HOST);
      HYPRE_IJMatrixGetObject(ij_B, &obj_B);
      parcsr_B = (HYPRE_ParCSRMatrix) obj_B;

      /* Check matrices */
      ierr += checkMatrix(parcsr_B, ij_A) > tol;

      if (print_matrix)
      {
         HYPRE_IJMatrixPrint(ij_A, "ij_sAg");
      }
      HYPRE_IJMatrixDestroy(ij_A);
      HYPRE_IJMatrixDestroy(ij_B);
   }

   /* Print the error code */
   hypre_ParPrintf(comm, "Test error code = %d\n", ierr);

   /*-----------------------------------------------------------
    * Free memory
    *-----------------------------------------------------------*/
   if (memory_location == HYPRE_MEMORY_DEVICE)
   {
      hypre_TFree(d_nnzrow, HYPRE_MEMORY_DEVICE);
      hypre_TFree(d_rows,   HYPRE_MEMORY_DEVICE);
      hypre_TFree(d_rows_coo,  HYPRE_MEMORY_DEVICE);
      hypre_TFree(d_cols,   HYPRE_MEMORY_DEVICE);
      hypre_TFree(d_coefs,  HYPRE_MEMORY_DEVICE);
   }
   hypre_TFree(h_nnzrow, HYPRE_MEMORY_HOST);
   hypre_TFree(h_rows,   HYPRE_MEMORY_HOST);
   hypre_TFree(h_rows_coo,  HYPRE_MEMORY_HOST);
   hypre_TFree(h_cols,   HYPRE_MEMORY_HOST);
   hypre_TFree(h_coefs,  HYPRE_MEMORY_HOST);

   HYPRE_ParCSRMatrixDestroy(parcsr_ref);

   /* Finalize Hypre */
   HYPRE_Finalize();

   /* Finalize MPI */
   hypre_MPI_Finalize();

   /* when using cuda-memcheck --leak-check full, uncomment this */
#if defined(HYPRE_USING_GPU) && !defined(HYPRE_TEST_USING_HOST)
   hypre_ResetDevice();
#endif

   return (0);
}

/*--------------------------------------------------------------------------
 * buildMatrixEntries
 *
 * Builds the reference matrix used by all tests and extracts its entries.
 *
 * A Laplacian with the requested stencil (7, 9 or 27 points) is generated on
 * the Px x Py x Pz process grid and migrated to host memory. Every stored
 * entry located strictly above the diagonal (global row < global column) is
 * then shifted by a rank-dependent amount, which makes the matrix
 * nonsymmetric. Finally the entries are extracted with getParCSRMatrixData().
 *
 * Outputs:
 *   ilower/iupper/jlower/jupper : local row/column ranges, shifted by `base`
 *   nrows_ptr, num_nonzeros_ptr : local sizes
 *   nnzrow_ptr, rows_ptr, rows_coo_ptr, cols_ptr, coefs_ptr :
 *                                 host arrays allocated by getParCSRMatrixData()
 *   parcsr_ptr                  : the reference matrix itself
 *
 * Returns hypre_error_flag.
 *--------------------------------------------------------------------------*/
HYPRE_Int
buildMatrixEntries(MPI_Comm            comm,
                   HYPRE_Int           nx,
                   HYPRE_Int           ny,
                   HYPRE_Int           nz,
                   HYPRE_Int           Px,
                   HYPRE_Int           Py,
                   HYPRE_Int           Pz,
                   HYPRE_Real          cx,
                   HYPRE_Real          cy,
                   HYPRE_Real          cz,
                   HYPRE_Int           base,
                   HYPRE_BigInt       *ilower_ptr,
                   HYPRE_BigInt       *iupper_ptr,
                   HYPRE_BigInt       *jlower_ptr,
                   HYPRE_BigInt       *jupper_ptr,
                   HYPRE_Int          *nrows_ptr,
                   HYPRE_BigInt       *num_nonzeros_ptr,
                   HYPRE_Int         **nnzrow_ptr,
                   HYPRE_BigInt      **rows_ptr,   /* row indices of length nrows */
                   HYPRE_BigInt      **rows_coo_ptr,  /* row indices of length nnz */
                   HYPRE_BigInt      **cols_ptr,   /* col indices of length nnz */
                   HYPRE_Real        **coefs_ptr,  /* values of length nnz */
                   HYPRE_Int           stencil,
                   HYPRE_ParCSRMatrix *parcsr_ptr)
{
   HYPRE_ParCSRMatrix  matrix = NULL;
   HYPRE_Real          values[4];
   HYPRE_Int           num_procs;
   HYPRE_Int           myid;
   HYPRE_Int           ip, iq, ir;

   hypre_MPI_Comm_size(comm, &num_procs );
   hypre_MPI_Comm_rank(comm, &myid );

   /* Coordinates of this rank in the process grid */
   ip = myid % Px;
   iq = (( myid - ip) / Px) % Py;
   ir = ( myid - ip - Px * iq) / ( Px * Py );

   /* Stencil coefficients handed to the Laplacian generators */
   values[0] = 0;
   values[1] = -cx;
   values[2] = -cy;
   values[3] = -cz;

   switch (stencil)
   {
      case 7:
         matrix = (HYPRE_ParCSRMatrix) GenerateLaplacian(comm, nx, ny, nz, Px, Py, Pz,
                                                         ip, iq, ir, values);
         break;

      case 9:
         matrix = (HYPRE_ParCSRMatrix) GenerateLaplacian9pt(comm, nx, ny, Px, Py,
                                                            ip, iq, values);
         break;

      case 27:
         matrix = (HYPRE_ParCSRMatrix) GenerateLaplacian27pt(comm, nx, ny, nz, Px, Py, Pz,
                                                             ip, iq, ir, values);
         break;

      default:
         hypre_assert(0);
         break;
   }

   /* The entries are read and modified on the host below */
   hypre_ParCSRMatrixMigrate(matrix, HYPRE_MEMORY_HOST);

   /* Modify the strictly upper triangular part to make the matrix nonsymmetric */
   hypre_CSRMatrix    *diag        = hypre_ParCSRMatrixDiag(matrix);
   hypre_CSRMatrix    *offd        = hypre_ParCSRMatrixOffd(matrix);
   HYPRE_Int          *diag_i      = hypre_CSRMatrixI(diag);
   HYPRE_Int          *diag_j      = hypre_CSRMatrixJ(diag);
   HYPRE_Real         *diag_data   = hypre_CSRMatrixData(diag);
   HYPRE_Int          *offd_i      = hypre_CSRMatrixI(offd);
   HYPRE_Int          *offd_j      = hypre_CSRMatrixJ(offd);
   HYPRE_Real         *offd_data   = hypre_CSRMatrixData(offd);
   HYPRE_BigInt       *offd_colmap = hypre_ParCSRMatrixColMapOffd(matrix);
   HYPRE_BigInt        first_row   = hypre_ParCSRMatrixFirstRowIndex(matrix);
   HYPRE_BigInt        first_col   = hypre_ParCSRMatrixFirstColDiag(matrix);
   HYPRE_Int           num_rows    = hypre_ParCSRMatrixNumRows(matrix);
   HYPRE_Int           r, p;

   for (r = 0; r < num_rows; r++)
   {
      const HYPRE_BigInt global_row = first_row + (HYPRE_BigInt) r;

      /* entries whose column is owned by this rank */
      for (p = diag_i[r]; p < diag_i[r + 1]; p++)
      {
         if (first_col + (HYPRE_BigInt) diag_j[p] > global_row)
         {
            diag_data[p] += (HYPRE_Real) myid + .89;
         }
      }

      /* entries whose column is owned by another rank */
      for (p = offd_i[r]; p < offd_i[r + 1]; p++)
      {
         if (offd_colmap[offd_j[p]] > global_row)
         {
            offd_data[p] += (HYPRE_Real) myid + .64;
         }
      }
   }

   /* Extract (I, J, data) from the modified matrix */
   getParCSRMatrixData(matrix, base, nrows_ptr, num_nonzeros_ptr, nnzrow_ptr,
                       rows_ptr, rows_coo_ptr, cols_ptr, coefs_ptr);

   /* Hand the index ranges (shifted by base) and the matrix to the caller */
   *ilower_ptr = hypre_ParCSRMatrixFirstRowIndex(matrix) + base;
   *iupper_ptr = hypre_ParCSRMatrixLastRowIndex(matrix) + base;
   *jlower_ptr = hypre_ParCSRMatrixFirstColDiag(matrix) + base;
   *jupper_ptr = hypre_ParCSRMatrixLastColDiag(matrix) + base;
   *parcsr_ptr = matrix;

   return hypre_error_flag;
}

/*--------------------------------------------------------------------------
 * getParCSRMatrixData
 *
 * Copies the local part of the ParCSR matrix A into newly allocated host
 * arrays, all of which are returned through the pointer arguments:
 *
 *   nnzrow   [nrows]        : number of entries of each emitted row
 *   rows     [nrows]        : global index of each emitted row
 *   rows_coo [num_nonzeros] : global row index of each entry
 *   cols     [num_nonzeros] : global column index of each entry
 *   coefs    [num_nonzeros] : value of each entry
 *
 * Rows are emitted in reverse local order (last local row first); within a
 * row the diag entries come first, followed by the offd entries. All global
 * indices are shifted by `base`. The matrix arrays are dereferenced directly,
 * so they must be accessible from the host.
 *
 * Returns hypre_error_flag.
 *--------------------------------------------------------------------------*/
HYPRE_Int
getParCSRMatrixData(HYPRE_ParCSRMatrix  A,
                    HYPRE_Int           base,
                    HYPRE_Int          *nrows_ptr,
                    HYPRE_BigInt       *num_nonzeros_ptr,
                    HYPRE_Int         **nnzrow_ptr,
                    HYPRE_BigInt      **rows_ptr,
                    HYPRE_BigInt      **rows_coo_ptr,
                    HYPRE_BigInt      **cols_ptr,
                    HYPRE_Real        **coefs_ptr)
{
   hypre_CSRMatrix    *diag        = hypre_ParCSRMatrixDiag(A);
   hypre_CSRMatrix    *offd        = hypre_ParCSRMatrixOffd(A);
   HYPRE_Int          *diag_i      = hypre_CSRMatrixI(diag);
   HYPRE_Int          *diag_j      = hypre_CSRMatrixJ(diag);
   HYPRE_Int          *offd_i      = hypre_CSRMatrixI(offd);
   HYPRE_Int          *offd_j      = hypre_CSRMatrixJ(offd);
   HYPRE_BigInt       *offd_colmap = hypre_ParCSRMatrixColMapOffd(A);

   HYPRE_BigInt        first_row   = hypre_ParCSRMatrixFirstRowIndex(A);
   HYPRE_BigInt        first_col   = hypre_ParCSRMatrixFirstColDiag(A);

   HYPRE_Int           num_rows    = hypre_ParCSRMatrixNumRows(A);
   HYPRE_BigInt        total_nnz   = hypre_CSRMatrixNumNonzeros(diag) +
                                     hypre_CSRMatrixNumNonzeros(offd);

   /* Output arrays (zero-initialized, host memory) */
   HYPRE_Int          *row_counts  = hypre_CTAlloc(HYPRE_Int,    num_rows,  HYPRE_MEMORY_HOST);
   HYPRE_BigInt       *row_ids     = hypre_CTAlloc(HYPRE_BigInt, num_rows,  HYPRE_MEMORY_HOST);
   HYPRE_BigInt       *entry_rows  = hypre_CTAlloc(HYPRE_BigInt, total_nnz, HYPRE_MEMORY_HOST);
   HYPRE_BigInt       *entry_cols  = hypre_CTAlloc(HYPRE_BigInt, total_nnz, HYPRE_MEMORY_HOST);
   HYPRE_Real         *entry_vals  = hypre_CTAlloc(HYPRE_Real,   total_nnz, HYPRE_MEMORY_HOST);

   HYPRE_Int           slot, src, p;
   HYPRE_Int           pos = 0;   /* next free position in the entry arrays */

   /* Output slot `slot` receives local row `src`, walking the matrix backwards */
   for (slot = 0; slot < num_rows; slot++)
   {
      src = num_rows - 1 - slot;

      row_counts[slot] = diag_i[src + 1] - diag_i[src] +
                         offd_i[src + 1] - offd_i[src];
      row_ids[slot]    = first_row + src + base;

      /* diag part: local column index offset by the first owned column */
      for (p = diag_i[src]; p < diag_i[src + 1]; p++)
      {
         entry_rows[pos] = first_row + (HYPRE_BigInt) src + base;
         entry_cols[pos] = first_col + (HYPRE_BigInt) diag_j[p] + base;
         entry_vals[pos] = hypre_CSRMatrixData(diag)[p];
         pos++;
      }

      /* offd part: compressed column index translated through the column map */
      for (p = offd_i[src]; p < offd_i[src + 1]; p++)
      {
         entry_rows[pos] = first_row + (HYPRE_BigInt) src + base;
         entry_cols[pos] = offd_colmap[offd_j[p]] + base;
         entry_vals[pos] = hypre_CSRMatrixData(offd)[p];
         pos++;
      }
   }

   hypre_assert(pos == total_nnz);

   /* Set pointers */
   *nrows_ptr        = num_rows;
   *num_nonzeros_ptr = total_nnz;
   *nnzrow_ptr       = row_counts;
   *rows_ptr         = row_ids;
   *rows_coo_ptr     = entry_rows;
   *cols_ptr         = entry_cols;
   *coefs_ptr        = entry_vals;

   return hypre_error_flag;
}


/*--------------------------------------------------------------------------
 * checkMatrix
 *
 * Measures how far the ParCSR object held by ij_A is from the reference
 * matrix h_parcsr_ref:
 *
 *    ||A_ref - A||_F / ||A_ref||_F
 *
 * The matrix of ij_A is cloned to host memory before the difference is
 * formed; both temporaries are destroyed before returning. The measure is
 * printed through hypre_ParPrintf on the communicator of ij_A.
 *
 * When the reference norm is zero the relative error is undefined (the
 * division would produce NaN or Inf, and NaN compares false against any
 * tolerance); in that case the absolute error ||A_ref - A||_F is returned
 * and printed instead.
 *--------------------------------------------------------------------------*/
HYPRE_Real
checkMatrix(HYPRE_ParCSRMatrix h_parcsr_ref, HYPRE_IJMatrix ij_A)
{
   MPI_Comm            comm         = hypre_IJMatrixComm(ij_A);
   HYPRE_ParCSRMatrix  parcsr_A     = (HYPRE_ParCSRMatrix) hypre_IJMatrixObject(ij_A);
   HYPRE_ParCSRMatrix  h_parcsr_A;
   HYPRE_ParCSRMatrix  parcsr_error = NULL;
   HYPRE_Real          fnorm_err, fnorm_ref, rel_err;

   /* Host copy (with data) of the assembled matrix */
   h_parcsr_A = hypre_ParCSRMatrixClone_v2(parcsr_A, 1, HYPRE_MEMORY_HOST);

   // Check norm of (parcsr_ref - parcsr_A)
   hypre_ParCSRMatrixAdd(1.0, h_parcsr_ref, -1.0, h_parcsr_A, &parcsr_error);
   fnorm_err = hypre_ParCSRMatrixFnorm(parcsr_error);
   fnorm_ref = hypre_ParCSRMatrixFnorm(h_parcsr_ref);

   /* Guard against a zero reference norm: fall back to the absolute error */
   if (fnorm_ref > 0.0)
   {
      rel_err = fnorm_err / fnorm_ref;
   }
   else
   {
      rel_err = fnorm_err;
   }

   hypre_ParPrintf(comm, "||A_ref - A||_F / ||A_ref||_F: %e\n", rel_err);

   HYPRE_ParCSRMatrixDestroy(h_parcsr_A);
   HYPRE_ParCSRMatrixDestroy(parcsr_error);

   return rel_err;
}

/*--------------------------------------------------------------------------
 * test_all
 *
 * Generic assembly test: creates a ParCSR-backed IJ matrix in
 * `memory_location` and executes, in order, the commands encoded in
 * `cmd_sequence`:
 *
 *    's' = HYPRE_IJMatrixSetValues
 *    'a' = HYPRE_IJMatrixAddToValues
 *    'g' = HYPRE_IJMatrixGetValues
 *    'A' = HYPRE_IJMatrixAssemble
 *
 * Any other character is ignored. The 's', 'a' and 'g' commands process the
 * local rows in chunks of about nrows / nchunks rows each.
 *
 *    option = 1: CSR-like calls; `rows` has length nrows and `nnzrow` holds
 *                the number of entries of each row
 *           = 2: COO-like calls; `rows` has length num_nonzeros and the
 *                row-count argument is passed as NULL
 *
 * h_nnzrow is a host copy of the per-row entry counts; it is used to compute
 * the offset of each chunk inside `cols` / `coefs` (and `rows` for option 2).
 * `nnzrow`, `rows`, `cols` and `coefs` are passed straight to the IJ
 * interface. The whole command sequence is timed under `test_name`.
 *
 * The grow factor is doubled on rank 0 so that ranks use different settings.
 *
 * The resulting matrix is returned through ij_A_ptr; the caller destroys it.
 * Returns hypre_error_flag.
 *--------------------------------------------------------------------------*/
HYPRE_Int
test_all(MPI_Comm             comm,
         const char          *test_name,
         HYPRE_MemoryLocation memory_location,
         HYPRE_Int            option,
         const char          *cmd_sequence,
         HYPRE_BigInt         ilower,
         HYPRE_BigInt         iupper,
         HYPRE_BigInt         jlower,
         HYPRE_BigInt         jupper,
         HYPRE_Int            nrows,
         HYPRE_BigInt         num_nonzeros,
         HYPRE_Int            nchunks,
         HYPRE_Int            init_alloc,
         HYPRE_Int            early_assemble,
         HYPRE_Real           grow_factor,
         HYPRE_Int           *h_nnzrow,
         HYPRE_Int           *nnzrow,
         HYPRE_BigInt        *rows,
         HYPRE_BigInt        *cols,
         HYPRE_Real          *coefs,
         HYPRE_IJMatrix      *ij_A_ptr)
{
   HYPRE_IJMatrix  ij_A;
   HYPRE_Int       i, j, chunk, chunk_size, chunk_nnz;
   HYPRE_Int       rows_per_chunk;
   HYPRE_Int       time_index;
   HYPRE_Int      *h_rowptr = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST);
   HYPRE_Int       cmd_len = (HYPRE_Int) strlen(cmd_sequence);
   HYPRE_Int       myid;

   hypre_MPI_Comm_rank(comm, &myid);

   /* Prefix sum of the row counts: h_rowptr[r] is the offset of row r's
    * first entry in the entry arrays */
   for (i = 1; i < nrows + 1; i++)
   {
      h_rowptr[i] = h_rowptr[i - 1] + h_nnzrow[i - 1];
   }
   hypre_assert(h_rowptr[nrows] == num_nonzeros);

   HYPRE_IJMatrixCreate(comm, ilower, iupper, jlower, jupper, &ij_A);
   HYPRE_IJMatrixSetObjectType(ij_A, HYPRE_PARCSR);
   HYPRE_IJMatrixInitialize_v2(ij_A, memory_location);
   HYPRE_IJMatrixSetOMPFlag(ij_A, 1);
   grow_factor = myid ? grow_factor : 2 * grow_factor;
   if (init_alloc >= 0)
   {
      HYPRE_IJMatrixSetInitAllocation(ij_A, init_alloc);
   }
   HYPRE_IJMatrixSetEarlyAssemble(ij_A, early_assemble);
   if (grow_factor > 0)
   {
      HYPRE_IJMatrixSetGrowFactor(ij_A, grow_factor);
   }

   /* Nominal number of rows per chunk. It is kept separate from the size of
    * the current chunk so that the shorter last chunk of one command does
    * not shrink the chunks of the following commands. It is also forced to
    * be at least 1: a zero value (nchunks > nrows) would never advance the
    * chunk loop, and nchunks <= 0 must not be used as a divisor. */
   rows_per_chunk = (nchunks > 0) ? nrows / nchunks : nrows;
   if (rows_per_chunk < 1)
   {
      rows_per_chunk = 1;
   }

#if defined(HYPRE_USING_GPU) && !defined(HYPRE_TEST_USING_HOST)
   hypre_SyncDevice();
#if defined(CUDA_PROFILER)
   cudaProfilerStart();
#endif
#endif

   time_index = hypre_InitializeTiming(test_name);
   hypre_BeginTiming(time_index);

   for (j = 0; j < cmd_len; j++)
   {
      if (cmd_sequence[j] == 's' ||
          cmd_sequence[j] == 'a' ||
          cmd_sequence[j] == 'g')
      {
         for (chunk = 0; chunk < nrows; chunk += chunk_size)
         {
            /* the last chunk may hold fewer rows than the nominal size */
            chunk_size = hypre_min(rows_per_chunk, nrows - chunk);
            if (1 == option)
            {
               if (cmd_sequence[j] == 's')
               {
                  HYPRE_IJMatrixSetValues(ij_A, chunk_size,
                                          &nnzrow[chunk],
                                          &rows[chunk],
                                          &cols[h_rowptr[chunk]],
                                          &coefs[h_rowptr[chunk]]);
               }
               else if (cmd_sequence[j] == 'a')
               {
                  HYPRE_IJMatrixAddToValues(ij_A, chunk_size,
                                            &nnzrow[chunk],
                                            &rows[chunk],
                                            &cols[h_rowptr[chunk]],
                                            &coefs[h_rowptr[chunk]]);
               }
               else /* if (cmd_sequence[j] == 'g') */
               {
                  HYPRE_IJMatrixGetValues(ij_A, chunk_size,
                                          &nnzrow[chunk],
                                          &rows[chunk],
                                          &cols[h_rowptr[chunk]],
                                          &coefs[h_rowptr[chunk]]);
               }
            }
            else
            {
               /* COO-like: one (row, col, coef) triple per entry of the chunk */
               chunk_nnz = h_rowptr[chunk + chunk_size] - h_rowptr[chunk];
               if (cmd_sequence[j] == 's')
               {
                  HYPRE_IJMatrixSetValues(ij_A, chunk_nnz, NULL,
                                          &rows[h_rowptr[chunk]],
                                          &cols[h_rowptr[chunk]],
                                          &coefs[h_rowptr[chunk]]);
               }
               else if (cmd_sequence[j] == 'a')
               {
                  HYPRE_IJMatrixAddToValues(ij_A, chunk_nnz, NULL,
                                            &rows[h_rowptr[chunk]],
                                            &cols[h_rowptr[chunk]],
                                            &coefs[h_rowptr[chunk]]);
               }
               else /* if (cmd_sequence[j] == 'g') */
               {
                  HYPRE_IJMatrixGetValues(ij_A, chunk_nnz, NULL,
                                          &rows[h_rowptr[chunk]],
                                          &cols[h_rowptr[chunk]],
                                          &coefs[h_rowptr[chunk]]);
               }
            }
         }
      }
      else if (cmd_sequence[j] == 'A')
      {
         HYPRE_IJMatrixAssemble(ij_A);
      }
   }

#if defined(HYPRE_USING_GPU) && !defined(HYPRE_TEST_USING_HOST)
   hypre_SyncDevice();
#if defined(CUDA_PROFILER)
   cudaProfilerStop();
#endif
#endif

   // Finalize timer
   hypre_EndTiming(time_index);
   hypre_PrintTiming(test_name, hypre_MPI_COMM_WORLD);
   hypre_FinalizeTiming(time_index);
   hypre_ClearTiming();

   // Free memory
   hypre_TFree(h_rowptr, HYPRE_MEMORY_HOST);

   // Set pointer to matrix
   *ij_A_ptr = ij_A;

   return hypre_error_flag;
}
