#' @title MERINGUE: Moran's I based Spatially Variable Gene Detection
#'
#' @description
#' Detect spatially variable genes using the MERINGUE approach based on
#' Moran's I spatial autocorrelation statistic.
#'
#' @name CalSVG_MERINGUE
NULL


#' Detect SVGs using MERINGUE Method
#'
#' @description
#' Identifies spatially variable genes by computing Moran's I spatial
#' autocorrelation statistic for each gene. Genes with significant positive
#' spatial autocorrelation (similar expression values clustering together)
#' are identified as SVGs.
#'
#' @param expr_matrix Numeric matrix of gene expression values.
#'   \itemize{
#'     \item Rows: genes
#'     \item Columns: spatial locations (spots/cells)
#'     \item Values: normalized expression (e.g., log-transformed counts)
#'   }
#'   Row names should be gene identifiers; column names should match
#'   row names of \code{spatial_coords}.
#'
#' @param spatial_coords Numeric matrix of spatial coordinates.
#'   \itemize{
#'     \item Rows: spatial locations (must match columns of expr_matrix)
#'     \item Columns: coordinate dimensions (x, y, and optionally z)
#'   }
#'
#' @param network_method Character string specifying how to construct the
#'   spatial neighborhood network.
#'   \itemize{
#'     \item \code{"delaunay"} (default): Delaunay triangulation. Creates natural
#'       neighbors based on geometric triangulation. Good for relatively uniform
#'       spatial distributions.
#'     \item \code{"knn"}: K-nearest neighbors. Each spot connected to its k
#'       nearest neighbors. More robust for irregular distributions.
#'   }
#'
#' @param k Integer. Number of neighbors for KNN method. Default is 10.
#'   Ignored when \code{network_method = "delaunay"}.
#'   \itemize{
#'     \item Smaller k (e.g., 5-6): More local patterns, faster computation
#'     \item Larger k (e.g., 15-20): Broader patterns, smoother results
#'   }
#'
#' @param filter_dist Numeric or NA. Maximum Euclidean distance for neighbors.
#'   Pairs with distance > filter_dist are not considered neighbors.
#'   Default is NA (no filtering). Useful for:
#'   \itemize{
#'     \item Removing long-range spurious connections
#'     \item Focusing on local spatial patterns
#'   }
#'
#' @param alternative Character string specifying the alternative hypothesis
#'   for the Moran's I test.
#'   \itemize{
#'     \item \code{"greater"} (default): Test for positive autocorrelation
#'       (clustering of similar values). Most appropriate for SVG detection.
#'     \item \code{"less"}: Test for negative autocorrelation (dissimilar
#'       values as neighbors).
#'     \item \code{"two.sided"}: Test for any autocorrelation.
#'   }
#'
#' @param adjust_method Character string specifying p-value adjustment method
#'   for multiple testing correction. Passed to \code{p.adjust()}.
#'   Options include: "BH" (default, Benjamini-Hochberg), "bonferroni",
#'   "holm", "hochberg", "hommel", "BY", "fdr", "none".
#'
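#' @param min_pct_cells Numeric (0-1). Minimum fraction of cells that must
#'   contribute to the spatial pattern for a gene to be retained as SVG.
#'   Default is 0.05 (5%). Intended for a LISA (Local Indicators of Spatial
#'   Association) based filter that removes genes driven by only a few
#'   outlier cells.
#'   Set to 0 to disable this filter.
#'   Note: the current implementation of this function accepts the argument
#'   but does not apply the filter; every tested gene is returned.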
#'
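#' @param n_threads Integer. Number of threads for parallel computation.
#'   Default is 1. Only used by the R fallback implementation, i.e. when
#'   \code{use_cpp = FALSE} or the C++ routine fails.
#'   \itemize{
#'     \item For large datasets: Set to number of available cores
#'     \item Uses R's parallel::mclapply (not available on Windows, where
#'       genes are processed sequentially regardless of this setting)
#'   }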
#'
#' @param use_cpp Logical. Whether to use C++ implementation for faster
#'   computation. Default is TRUE. Falls back to R if C++ fails.
#'
#' @param verbose Logical. Whether to print progress messages. Default is TRUE.
#'
#' @return A data.frame with SVG detection results, sorted by significance.
#'   Columns:
#'   \itemize{
#'     \item \code{gene}: Gene identifier
#'     \item \code{observed}: Observed Moran's I statistic. Range: [-1, 1].
#'       Positive values indicate clustering, negative indicate dispersion.
#'     \item \code{expected}: Expected Moran's I under null (approximately -1/(n-1))
#'     \item \code{sd}: Standard deviation under null hypothesis
#'     \item \code{z_score}: Standardized test statistic (observed - expected) / sd
#'     \item \code{p.value}: Raw p-value from normal approximation
#'     \item \code{p.adj}: Adjusted p-value (multiple testing corrected)
#'   }
#'
#' @details
#' \strong{Method Overview:}
#'
#' MERINGUE uses Moran's I, a classic measure of spatial autocorrelation:
#' \deqn{I = \frac{n}{W} \frac{\sum_i \sum_j w_{ij}(x_i - \bar{x})(x_j - \bar{x})}{\sum_i (x_i - \bar{x})^2}}
#'
#' where:
#' \itemize{
#'   \item n = number of spatial locations
#'   \item W = sum of all spatial weights
#'   \item w_ij = spatial weight between locations i and j
#'   \item x_i = expression value at location i
#' }
#'
#' \strong{Interpretation:}
#' \itemize{
#'   \item I > 0: Positive autocorrelation (similar values cluster)
#'   \item I = 0: Random spatial distribution
#'   \item I < 0: Negative autocorrelation (checkerboard pattern)
#' }
#'
#' \strong{Statistical Testing:}
#' P-values are computed using normal approximation based on analytical
#' formulas for the expected value and variance of Moran's I under the
#' null hypothesis of complete spatial randomness.
#'
#' \strong{Computational Considerations:}
#' \itemize{
#'   \item Time complexity: O(n^2) for network construction, O(n*m) for testing
#'     (n = spots, m = genes)
#'   \item Memory: O(n^2) for storing spatial weights matrix
#'   \item For n > 10,000 spots, consider using KNN with small k
#' }
#'
#' @examples
#' # Load example data
#' data(example_svg_data)
#' expr <- example_svg_data$logcounts[1:20, ]  # Use subset for speed
#' coords <- example_svg_data$spatial_coords
#'
#' \donttest{
#' # Basic usage (requires RANN package for KNN)
#' if (requireNamespace("RANN", quietly = TRUE)) {
#'     results <- CalSVG_MERINGUE(expr, coords, 
#'                                network_method = "knn", k = 10,
#'                                verbose = FALSE)
#'     head(results)
#'
#'     # Get significant SVGs
#'     sig_genes <- results$gene[results$p.adj < 0.05]
#' }
#' }
#'
#' @references
#' \itemize{
#'   \item Miller, B.F. et al. (2021) Characterizing spatial gene expression
#'     heterogeneity in spatially resolved single-cell transcriptomic data
#'     with nonuniform cellular densities. Genome Research.
#'   \item Moran, P.A.P. (1950) Notes on Continuous Stochastic Phenomena.
#'     Biometrika.
#'   \item Cliff, A.D. and Ord, J.K. (1981) Spatial Processes: Models &
#'     Applications. Pion.
#' }
#'
#' @seealso
#' \code{\link{CalSVG}} for unified interface,
#' \code{\link{buildSpatialNetwork}} for network construction,
#' \code{\link{moranI_test}} for individual gene testing
#'
#' @export
CalSVG_MERINGUE <- function(expr_matrix,
                            spatial_coords,
                            network_method = c("delaunay", "knn"),
                            k = 10L,
                            filter_dist = NA,
                            alternative = c("greater", "less", "two.sided"),
                            adjust_method = "BH",
                            min_pct_cells = 0.05,
                            n_threads = 1L,
                            use_cpp = TRUE,
                            verbose = TRUE) {

    # Resolve the enumerated arguments to one validated choice each.
    network_method <- match.arg(network_method)
    alternative <- match.arg(alternative)

    # NOTE(review): `min_pct_cells` is part of the signature but is not read
    # anywhere in this function body, so no LISA-based filtering is applied to
    # the returned table. Kept untouched for interface compatibility.

    # =========================================================================
    # Input Validation
    # =========================================================================

    if (!is.matrix(expr_matrix)) {
        expr_matrix <- as.matrix(expr_matrix)
    }
    if (!is.matrix(spatial_coords)) {
        spatial_coords <- as.matrix(spatial_coords)
    }

    # Synthesize identifiers when they are missing so that the name-based
    # alignment below and the `gene` column of the result are always defined.
    if (is.null(colnames(expr_matrix))) {
        colnames(expr_matrix) <- paste0("spot_", seq_len(ncol(expr_matrix)))
    }
    if (is.null(rownames(expr_matrix))) {
        # Without row names, data.frame(gene = NULL, ...) would silently drop
        # the `gene` column from the output.
        rownames(expr_matrix) <- paste0("gene_", seq_len(nrow(expr_matrix)))
    }
    if (is.null(rownames(spatial_coords))) {
        # Unnamed coordinates are assumed to be in the same order as the
        # expression columns; that only makes sense when the counts agree.
        if (nrow(spatial_coords) != ncol(expr_matrix)) {
            stop("spatial_coords has no row names and its number of rows (",
                 nrow(spatial_coords), ") does not match the number of ",
                 "columns of expr_matrix (", ncol(expr_matrix), ")")
        }
        rownames(spatial_coords) <- colnames(expr_matrix)
    }

    # Keep only the spots present in both inputs, in expression-matrix order.
    common_samples <- intersect(colnames(expr_matrix), rownames(spatial_coords))
    if (length(common_samples) == 0) {
        stop("No matching samples between expr_matrix and spatial_coords")
    }

    expr_matrix <- expr_matrix[, common_samples, drop = FALSE]
    spatial_coords <- spatial_coords[common_samples, , drop = FALSE]

    n_genes <- nrow(expr_matrix)
    n_spots <- ncol(expr_matrix)

    if (verbose) {
        message("=== CalSVG_MERINGUE ===")
        message(sprintf("  Genes: %d", n_genes))
        message(sprintf("  Spots: %d", n_spots))
        message(sprintf("  Network: %s", network_method))
    }

    # =========================================================================
    # Build Spatial Network
    # =========================================================================

    if (verbose) message("Building spatial neighborhood network...")

    W <- buildSpatialNetwork(
        coords = spatial_coords,
        method = network_method,
        k = k,
        filter_dist = filter_dist,
        binary = TRUE,
        verbose = FALSE
    )

    # Align the weight matrix with the expression columns. `drop = FALSE`
    # keeps a matrix even when only a single spot remains.
    W <- W[common_samples, common_samples, drop = FALSE]

    if (verbose) {
        # The half-count is not a whole number when the network is asymmetric,
        # and sprintf("%d") rejects non-integer doubles, so format as "%.0f".
        n_edges <- sum(W > 0) / 2
        avg_neighbors <- mean(rowSums(W))
        message(sprintf("  Network: %.0f edges, avg %.1f neighbors/spot",
                       n_edges, avg_neighbors))
    }

    # =========================================================================
    # Flag Untestable Genes
    # =========================================================================

    # Moran's I is undefined for (near-)constant genes. A variance of NA
    # (missing expression values, or fewer than two spots) is also treated as
    # untestable instead of aborting later in an `if (NA)` condition.
    gene_vars <- apply(expr_matrix, 1, var)
    low_var_genes <- is.na(gene_vars) | gene_vars < 1e-10
    n_low_var <- sum(low_var_genes)
    if (verbose && n_low_var > 0) {
        message(sprintf("  Skipping %d genes with zero variance", n_low_var))
    }

    # =========================================================================
    # Row-standardize Weight Matrix (Precompute for all genes)
    # =========================================================================

    # Spots without neighbors keep an all-zero row (divide by 1, not by 0).
    rs <- rowSums(W)
    rs[rs == 0] <- 1
    W_rs <- W / rs

    # =========================================================================
    # Compute Moran's I for Each Gene
    # =========================================================================

    if (verbose) message("Computing Moran's I for each gene...")

    # Try the batch C++ routine first; any error yields NULL and triggers the
    # per-gene R fallback below.
    cpp_result <- NULL
    if (use_cpp) {
        cpp_result <- tryCatch(
            moranI_full_cpp(expr_matrix, W_rs),
            error = function(e) {
                if (verbose) {
                    message("  C++ failed, falling back to R implementation")
                }
                NULL
            }
        )
    }
    cpp_success <- !is.null(cpp_result)

    if (cpp_success) {
        observed <- cpp_result$observed
        expected <- cpp_result$expected
        sd_vals <- cpp_result$sd

        # Mask untestable genes so both code paths report NA for them.
        observed[low_var_genes] <- NA_real_
        expected[low_var_genes] <- NA_real_
        sd_vals[low_var_genes] <- NA_real_

        # Normal-approximation p-values. The upper tail is requested directly
        # (lower.tail = FALSE): `1 - pnorm()` rounds to exactly 0 for large
        # z-scores, which would make the top genes indistinguishable.
        p_lower <- pnorm(observed, mean = expected, sd = sd_vals)
        p_upper <- pnorm(observed, mean = expected, sd = sd_vals,
                         lower.tail = FALSE)
        p_vals <- switch(alternative,
            "greater" = p_upper,
            "less" = p_lower,
            "two.sided" = 2 * pmin(p_lower, p_upper)
        )

        results <- data.frame(
            gene = rownames(expr_matrix),
            observed = observed,
            expected = expected,
            sd = sd_vals,
            p.value = p_vals,
            stringsAsFactors = FALSE
        )

    } else {
        # R fallback: one moranI_test() call per gene.
        na_row <- c(observed = NA_real_, expected = NA_real_,
                    sd = NA_real_, p.value = NA_real_)

        compute_moran_one <- function(gene_idx) {
            if (low_var_genes[gene_idx]) {
                return(na_row)
            }
            # A failure for one gene yields an NA row, not an aborted run.
            tryCatch(
                moranI_test(expr_matrix[gene_idx, ], W_rs,
                            alternative = alternative, standardize = FALSE),
                error = function(e) na_row
            )
        }

        if (n_threads > 1 && .Platform$OS.type != "windows") {
            # Fork-based parallelism; unavailable on Windows.
            results_list <- parallel::mclapply(
                seq_len(n_genes),
                compute_moran_one,
                mc.cores = n_threads
            )
        } else {
            show_progress <- verbose && n_genes > 100
            if (show_progress) {
                pb <- txtProgressBar(min = 0, max = n_genes, style = 3)
            }
            results_list <- lapply(seq_len(n_genes), function(i) {
                result <- compute_moran_one(i)
                if (show_progress) setTxtProgressBar(pb, i)
                result
            })
            if (show_progress) close(pb)
        }

        results_matrix <- do.call(rbind, results_list)
        results <- data.frame(
            gene = rownames(expr_matrix),
            observed = results_matrix[, "observed"],
            expected = results_matrix[, "expected"],
            sd = results_matrix[, "sd"],
            p.value = results_matrix[, "p.value"],
            stringsAsFactors = FALSE
        )
    }

    # Standardized test statistic.
    results$z_score <- (results$observed - results$expected) / results$sd

    # Multiple-testing correction; p.adjust() ignores NA p-values.
    results$p.adj <- p.adjust(results$p.value, method = adjust_method)

    # Most significant first; untestable genes (NA) are placed last.
    results <- results[order(results$p.value), ]
    rownames(results) <- NULL

    # =========================================================================
    # Summary Statistics
    # =========================================================================

    if (verbose) {
        n_sig_raw <- sum(results$p.value < 0.05, na.rm = TRUE)
        n_sig_adj <- sum(results$p.adj < 0.05, na.rm = TRUE)
        message(sprintf("  Significant genes (p < 0.05): %d raw, %d adjusted",
                       n_sig_raw, n_sig_adj))
        message("=== Done ===")
    }

    results
}


#' Local Indicators of Spatial Association (LISA)
#'
#' @description
#' Computes LISA statistics for identifying local clusters and outliers.
#' Used to filter genes driven by only a few cells.
#'
#' @param x Numeric vector of values.
#' @param W Spatial weights matrix.
#' @param alternative Alternative hypothesis ("greater", "less", "two.sided").
#'
#' @return Data frame with LISA statistics for each location.
#'
#' @keywords internal
lisa_test <- function(x, W, alternative = "greater") {

    # Validate the alternative up front; an unrecognised value would otherwise
    # be treated silently as "less".
    alternative <- match.arg(alternative, c("greater", "less", "two.sided"))

    N <- length(x)

    # Work on a base matrix and make sure it lines up with `x`.
    W <- as.matrix(W)
    if (nrow(W) != N || ncol(W) != N) {
        stop("W must be a square matrix with one row and column per element of x")
    }

    # Row-standardize the weights; locations without neighbors keep an
    # all-zero row (divide by 1, not by 0).
    rs <- rowSums(W)
    rs[rs == 0] <- 1
    W <- W / rs

    # Local Moran's I: centered value times the weighted average of the
    # centered values of the neighbors, scaled by the variance.
    z <- x - mean(x, na.rm = TRUE)
    s2 <- sum(z^2, na.rm = TRUE) / N
    lz <- drop(W %*% z)
    Ii <- (z / s2) * lz

    # Expected value under the null. After row standardization Wi is 1 for
    # locations with neighbors and 0 for isolated ones.
    Wi <- rowSums(W)
    E_Ii <- -Wi / (N - 1)

    # Variance under the null; b2 is the sample kurtosis of x.
    b2 <- (sum(z^4, na.rm = TRUE) / N) / (s2^2)
    Wi2 <- rowSums(W^2)
    A <- (N - b2) / (N - 1)
    B <- (2 * b2 - N) / ((N - 1) * (N - 2))
    V_Ii <- A * Wi2 + B * (Wi^2 - Wi2) - E_Ii^2
    # Clamp tiny negative variances caused by rounding.
    Sd_Ii <- sqrt(pmax(V_Ii, 0))

    # Normal-approximation p-values. The upper tail is requested directly
    # because `1 - pnorm()` loses all precision for large statistics.
    p_lower <- pnorm(Ii, mean = E_Ii, sd = Sd_Ii)
    p_upper <- pnorm(Ii, mean = E_Ii, sd = Sd_Ii, lower.tail = FALSE)
    pvals <- switch(alternative,
        "greater" = p_upper,
        "less" = p_lower,
        "two.sided" = 2 * pmin(p_lower, p_upper)
    )

    # A zero standard deviation (e.g. a location without neighbors) makes the
    # null distribution a point mass, for which pnorm() returns 0 or 1; the
    # "greater" and "two.sided" p-values would then come out as 0, i.e.
    # maximally significant. Such locations carry no evidence, so report 1.
    degenerate <- !is.na(Sd_Ii) & Sd_Ii == 0
    pvals[degenerate] <- 1

    data.frame(
        observed = Ii,
        expected = E_Ii,
        sd = Sd_Ii,
        p.value = pvals
    )
}
