
#' @title Extraction d'un bilan qualité
#'
#' @description
#' Permet d'extraire un bilan qualité à partir du fichier CSV contenant la
#' matrice des diagnostics.
#'
#' @param matrix_output_file Chaîne de caracère. Chemin vers le fichier CSV
#' contenant la matrice des diagnostics.
#' @param file Chaîne de caracère. Chemin vers le fichier CSV contenant la
#' matrice des diagnostics. Cet argument remplace l'argument
#' \code{matrix_output_file}.
#' @param sep séparateur de caractères utilisé dans le fichier csv (par défaut
#' \code{sep = ";"})
#' @param dec séparateur décimal utilisé dans le fichier csv (par défaut
#' \code{dec = ","})
#' @param thresholds \code{list} de vecteurs numériques. Seuils appliqués aux
#' différents tests afin de classer en modalités \code{Good}, \code{Uncertain},
#' \code{Bad} et \code{Severe}.
#' Par défault, la valeur de l'option \code{"jdc_threshold"} est utilisée.
#' Vous pouvez appeler la fonction \code{\link{get_thresholds}} pour voir à quoi
#' doit ressemble l'objet \code{thresholds}.
#'
#' @details La fonction permet d'extraire un bilan qualité à partir d'un
#' fichier csv contenant l'ensemble des
#' diagnostics (généralement fichier \emph{demetra_m.csv}).
#'
#' Ce fichier peut être obtenu en lançant le cruncher
#' (\code{\link[rjwsacruncher]{cruncher}} ou
#' \code{\link[rjwsacruncher]{cruncher_and_param}}) avec l'ensemble des
#' paramètres de base pour les paramètres à exporter et l'option
#' \code{csv_layout = "vtable"} (par défaut) pour le format de sortie des
#' fichiers csv (option de \code{\link[rjwsacruncher]{cruncher_and_param}} ou de
#' \code{\link[rjwsacruncher]{create_param_file}} lors de la création du fichier
#' de paramètres).
#'
#' Le résultat de cette fonction est un objet \code{\link{QR_matrix}} qui est
#' une liste de trois paramètres :
#'
#' * le paramètre \code{modalities} est un \code{data.frame} contenant un
#'   ensemble de variables sous forme catégorielle (Good, Uncertain, Bad,
#'   Severe).
#' * le paramètre \code{values} est un \code{data.frame} contenant les valeurs
#'   associées aux indicateurs présents dans \code{modalities} (i.e. :
#'   p-valeurs, statistiques, etc.) ainsi que des variables qui n'ont pas de
#'   modalité (fréquence de la série et modèle ARIMA).
#' * le paramètre \code{score_formula} est initié à \code{NULL} : il contiendra
#'   la formule utilisée pour calculer le score (si le calcul est fait).
#'
#' Si \code{x} est fourni, les arguments \code{fichier} et
#' \code{matrix_output_file} sont ignorés. L'argument \code{fichier} désigne
#' également le chemin vers le fichier qui contient la matrice de diagnostic
#' (qui peut être importée en parallèle dans R et utilisée avec l'argument
#' \code{x}).
#'
#' @encoding UTF-8
#' @return Un objet de type \code{\link{QR_matrix}}.
#' @examples
#' # Chemin menant au fichier demetra_m.csv
#' demetra_path <- file.path(
#'     system.file("extdata", package = "JDCruncheR"),
#'     "WS/ws_ipi/Output/SAProcessing-1",
#'     "demetra_m.csv"
#' )
#'
#' # Extraire le bilan qualité à partir du fichier demetra_m.csv
#' QR <- extract_QR(file = demetra_path)
#'
#' print(QR)
#'
#' # Extraire les modalités de la matrice
#' QR[["modalities"]]
#' # Or:
#' QR[["modalities"]]
#'
#' @keywords internal
#' @name fr-extract_QR
NULL
#> NULL


#' @title Extraction of a quality report
#'
#' @description
#' To extract a quality report from the csv file containing the diagnostics
#' matrix.
#'
#' @param matrix_output_file the csv file containing the diagnostics matrix.
#' @param file the csv file containing the diagnostics matrix. This argument
#' supersedes the argument \code{matrix_output_file}.
#' @param x data.frame containing the diagnostics matrix.
#' @param sep the separator used in the csv file (by default, \code{sep = ";"})
#' @param dec the decimal separator used in the csv file (by default,
#' \code{dec = ","})
#' @param thresholds \code{list} of numerical vectors. Thresholds applied to the
#' various tests in order to classify into modalities \code{Good},
#' \code{Uncertain}, \code{Bad} and \code{Severe}.
#' By default, the value of the \code{"jdc_threshold"} option is used.
#' You can call the \code{\link{get_thresholds}} function to see what the
#' \code{thresholds} object should look like.
#'
#' @details This function generates a quality report from a csv file containing
#' diagnostics (usually from the file \emph{demetra_m.csv}).
#' The \emph{demetra_m.csv} file can be generated by launching the cruncher
#' (functions \code{\link[rjwsacruncher]{cruncher}} or
#' \code{\link[rjwsacruncher]{cruncher_and_param}}) with the default export
#' parameters, having used the default option \code{csv_layout = "vtable"} to
#' format the output tables of the functions
#' \code{\link[rjwsacruncher]{cruncher_and_param}} and
#' \code{\link[rjwsacruncher]{create_param_file}} when creating the parameters
#' file.
#'
#' This function returns a \code{\link{QR_matrix}} object, which is a list of 3
#' objects:
#'
#' * \code{modalities}, a \code{data.frame} containing several indicators and
#'   their categorical quality (Good, Uncertain, Bad, Severe).
#' * \code{values}, a \code{data.frame} containing the same indicators and the
#'   values that lead to their quality category (i.e.: p-values, statistics,
#'   etc.) as well as additional variables that don't have a modality/quality
#'   (series frequency and arima model).
#' * \code{score_formula} that will store the formula used to calculate the
#'   score (when relevant). Its initial value is \code{NULL}.
#'
#' If \code{x} is supplied, the \code{file} and \code{matrix_output_file}
#' arguments are ignored. The \code{file} argument also designates the path to
#' the file containing the diagnostic matrix (which can be imported into R in
#' parallel and used with the \code{x} argument).
#'
#' @encoding UTF-8
#'
#' @return a \code{\link{QR_matrix}} object.
#'
#' @family QR_matrix functions
#' @examples
#' # Path of matrix demetra_m
#' demetra_path <- file.path(
#'     system.file("extdata", package = "JDCruncheR"),
#'     "WS/ws_ipi/Output/SAProcessing-1",
#'     "demetra_m.csv"
#' )
#'
#' # Extract the quality report from the demetra_m file
#' QR <- extract_QR(file = demetra_path)
#'
#' print(QR)
#'
#' # Extract the modalities matrix:
#' QR[["modalities"]]
#' # Or:
#' QR[["modalities"]]
#'
#' @importFrom stats sd
#' @importFrom utils read.csv
#' @seealso [Traduction française][fr-extract_QR()]
#' @export
extract_QR <- function(file,
                       x,
                       matrix_output_file,
                       sep = ";",
                       dec = ",",
                       thresholds = getOption("jdc_thresholds")) {
    if (!missing(matrix_output_file)) {
        warning("The `matrix_output_file` argument is deprecated",
                " and will be removed in the future. ",
                "Please use the `file` argument instead or ",
                "the `x` argument which may contain a diagnostic matrix ",
                "that has already been imported.")
        file <- matrix_output_file
    }

    if (missing(x) && missing(file)) {
        stop("Please call extract_QR() on a csv file containing at least ",
             "one cruncher output matrix (demetra_m.csv for example) ",
             "with the argument `file` ",
             "or directly on a matrix with the argument `x`")
    } else if (missing(x)) {
        if (length(file) == 0L
            || !file.exists(file)
            || !endsWith(x = file, suffix = ".csv")) {
            stop("The chosen file desn't exist or isn't a csv file")
        }

        demetra_m <- read.csv(
            file = file,
            sep = sep,
            dec = dec,
            stringsAsFactors = FALSE,
            na.strings = c("NA", "?"),
            fileEncoding = "latin1",
            quote = ""
        )
    } else {
        demetra_m <- x
    }

    if (nrow(demetra_m) == 0L || ncol(demetra_m) == 0L) {
        stop("The chosen csv file is empty")
    }

    series <- gsub(
        "(^ *)|(* $)", "",
        gsub("(^.* \\* )|(\\[frozen\\])", "", demetra_m[, 1L])
    )

    stat_Q <- extractStatQ(demetra_m, thresholds)
    stat_OOS <- extractOOS_test(demetra_m, thresholds)
    normality <- extractNormalityTests(demetra_m, thresholds)
    outliers <- extractOutliers(demetra_m, thresholds)
    test <- extractTest(demetra_m, thresholds)

    QR_modalities <- data.frame(
        series = series,
        normality[["modalities"]],
        test[["modalities"]],
        stat_OOS[["modalities"]],
        stat_Q[["modalities"]],
        outliers[["modalities"]]
    )
    QR_values <- data.frame(
        series = series,
        normality[["values"]],
        test[["values"]],
        stat_OOS[["values"]],
        stat_Q[["values"]],
        outliers[["values"]],
        frequency = extractFrequency(demetra_m),
        arima_model = extractARIMA(demetra_m)
    )

    QR <- QR_matrix(modalities = QR_modalities, values = QR_values)
    return(QR)
}

extractFrequency <- function(demetra_m) {
    if (anyNA(match(c("start", "end", "n"), colnames(demetra_m)))) {
        stop("Error in the extraction of the series frequency ",
             "(missing either the start date, ",
             "the end date or the number of observations)")
    }
    start <- as.Date(demetra_m[["start"]], format = "%Y-%m-%d")
    end <- as.Date(demetra_m[["end"]], format = "%Y-%m-%d")
    n <- demetra_m[["n"]]

    start <- data.frame(
        y = as.numeric(format(start, "%Y")),
        m = as.numeric(format(start, "%m"))
    )
    end <- data.frame(
        y = as.numeric(format(end, "%Y")),
        m = as.numeric(format(end, "%m"))
    )
    freq <- c(12L, 6L, 4L, 3L, 2L)
    nobs_compute <- matrix(
        data = sapply(
            X = freq,
            FUN = function(x) {
                x * (end[, 1L] - start[, 1L]) + (end[, 2L] - start[, 2L]) / (12. / x)
            }
        ),
        nrow = nrow(demetra_m)
    )
    output <- vapply(
        X = seq_len(nrow(nobs_compute)),
        FUN = function(i) {
            freq[which((nobs_compute[i, ] == n[i])
                       | (nobs_compute[i, ] + 1L == n[i])
                       | (nobs_compute[i, ] - 1L == n[i]))[[1L]]]
        },
        FUN.VALUE = integer(1L)
    )
    return(output)
}

extractARIMA <- function(demetra_m) {
    q_possibles <- grep("(^q$)|(^q\\.\\d$)", colnames(demetra_m))
    bp_possibles <- grep("(^bp$)|(^bp\\.\\d$)", colnames(demetra_m))

    if (length(q_possibles) > 1L) {
        val_q <- demetra_m[, q_possibles]
        integer_col <- which(vapply(val_q, is.integer, FUN.VALUE = logical(1L)))
        if (length(integer_col) == 0L) {
            val_q <- rep(NA_integer_, nrow(demetra_m))
        } else if (length(integer_col) == 1L) {
            val_q <- val_q[, integer_col[[1L]]]
        }
    } else if (length(q_possibles) == 1L) {
        val_q <- demetra_m[, q_possibles]
    } else  {
        stop("Error in the extraction of the arima order q: multiple column.")
    }

    if (length(bp_possibles) > 1L) {
        val_bp <- demetra_m[, bp_possibles]
        integer_col <- which(vapply(val_bp, is.integer, FUN.VALUE = logical(1L)))
        if (length(integer_col) == 0L) {
            val_bp <- rep(NA_integer_, nrow(demetra_m))
        } else {
            val_bp <- val_bp[, integer_col[[1L]]]
        }
    } else if (length(bp_possibles) == 1L) {
        val_bp <- demetra_m[, bp_possibles]
    } else  {
        stop("Error in the extraction of the arima order bp: multiple column.")
    }

    if (!all(
        is.integer(val_q) || all(is.na(val_q)),
        is.integer(val_bp) || all(is.na(val_bp))
    )) {
        stop("Error in the extraction of the arima order q or bp")
    }
    arima <- data.frame(
        arima_p = demetra_m[, "p"],
        arima_d = demetra_m[, "d"],
        arima_q = val_q,
        arima_bp = val_bp,
        arima_bd = demetra_m[, "bd"],
        arima_bq = demetra_m[, "bq"]
    )
    arima[["arima_model"]] <- paste0(
        "(", arima[["arima_p"]], ",", arima[["arima_d"]], ",", arima[["arima_q"]], ")",
        "(", arima[["arima_bp"]], ",", arima[["arima_bd"]], ",", arima[["arima_bq"]], ")"
    )
    return(arima[["arima_model"]])
}

extractStatQ <- function(demetra_m, thresholds = getOption("jdc_thresholds")) {

    col_q <- q_possibles <- grep("(^q$)|(^q\\.\\d$)",
                                 colnames(demetra_m))
    col_q_m2 <- q_m2_possibles <- grep("(^q\\.m2$)|(^q\\.m2\\.\\d$)",
                                       colnames(demetra_m))

    if (length(q_possibles) > 1L) {
        col_q_possibles <- demetra_m[, q_possibles]
        NA_cols <- which(unlist(lapply(
            X = col_q_possibles,
            FUN = function(x) all(is.na(x))
        )))
        num_cols <- which(unlist(lapply(
            X = col_q_possibles,
            FUN = function(x) !all(is.integer(x) | is.character(x) | is.na(x))
        )))

        if (length(num_cols) > 1L) {
            stop("Error in the extraction of the Q stats: multiple columns found")
        } else if (length(num_cols) == 1L) {
            col_q <- q_possibles[num_cols]
        } else if (length(NA_cols) > 0L) {
            col_q <- q_possibles[NA_cols[[1L]]]
        } else {
            stop("Error in the extraction of the Q stats")
        }
    }

    if (length(q_m2_possibles) > 1L) {
        col_q_m2_possibles <- demetra_m[, q_m2_possibles]
        NA_cols <- which(unlist(lapply(
            X = col_q_m2_possibles,
            FUN = function(x) all(is.na(x))
        )))
        num_cols <- which(unlist(lapply(
            X = col_q_m2_possibles,
            FUN = function(x) !all(is.integer(x) | is.character(x) | is.na(x))
        )))

        if (length(num_cols) > 1L) {
            stop("Error in the extraction of the Q stats: multiple colum found")
        } else if (length(num_cols) == 1L) {
            col_q_m2 <- q_m2_possibles[num_cols]
        } else if (length(NA_cols) > 0L) {
            col_q_m2 <- q_m2_possibles[NA_cols[[1L]]]
        } else {
            stop("Error in the extraction of the Q-M2 stats")
        }
    }

    stat_Q_modalities <- data.frame(
        q = cut(
            x = as.numeric(demetra_m[, col_q]),
            breaks = c(-Inf, thresholds[["q"]]),
            labels = names(thresholds[["q"]]),
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        ),
        q_m2 = cut(
            x = as.numeric(demetra_m[, col_q_m2]),
            breaks = c(-Inf, thresholds[["q_m2"]]),
            labels = names(thresholds[["q_m2"]]),
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        ),
        stringsAsFactors = FALSE
    )
    stat_Q_values <- data.frame(
        q = demetra_m[, col_q],
        q_m2 = demetra_m[, col_q_m2],
        stringsAsFactors = FALSE
    )

    return(list(modalities = stat_Q_modalities, values = stat_Q_values))
}

extractOOS_test <- function(demetra_m, thresholds = getOption("jdc_thresholds")) {

    col_mean <- mean_possibles <- grep("(^mean$)|(^mean\\.\\d$)", colnames(demetra_m))

    if (length(mean_possibles) > 1L) {
        col_mean_possibles <- demetra_m[, mean_possibles]
        NA_cols <- which(unlist(lapply(
            X = col_mean_possibles,
            FUN = function(x) all(is.na(x))
        )))
        num_cols <- which(unlist(lapply(
            X = col_mean_possibles,
            FUN = function(x) !all(is.integer(x) | is.character(x) | is.na(x))
        )))

        if (length(num_cols) > 1L) {
            stop("Error in the extraction of the mean in the out of sample diagnostics: multiple column")
        } else if (length(num_cols) == 1L) {
            col_mean <- mean_possibles[num_cols]
        } else if (length(NA_cols) > 0L) {
            col_mean <- mean_possibles[NA_cols[[1L]]]
        } else {
            stop("Error in the extraction of the mean in the out of sample diagnostics")
        }

    }

    col_mse <- match("mse", colnames(demetra_m))
    if (all(is.na(col_mse))) {
        val_mse <- rep(NA_real_, nrow(demetra_m))
    } else if (length(col_mse) == 1L) {
        val_mse <- demetra_m[, col_mse]
        if (is.character(val_mse)) {
            col_mse <- col_mse + 1L
            val_mse <- demetra_m[, col_mse]
        }
    } else {
        stop("Error in the extraction of the mse in the out of sample diagnostics: multiple column")
    }

    stat_OOS_modalities <- data.frame(
        oos_mean = cut(
            x = as.numeric(demetra_m[, col_mean]),
            breaks = c(-Inf, thresholds[["oos_mean"]]),
            labels = names(thresholds[["oos_mean"]]),
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        ),
        oos_mse = cut(
            x = as.numeric(val_mse),
            breaks = c(-Inf, thresholds[["oos_mse"]]),
            labels = names(thresholds[["oos_mse"]]),
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        ),
        stringsAsFactors = FALSE
    )

    stat_OOS_values <- data.frame(
        oos_mean = as.numeric(demetra_m[, col_mean]),
        oos_mse = as.numeric(val_mse),
        stringsAsFactors = FALSE
    )

    return(list(modalities = stat_OOS_modalities, values = stat_OOS_values))
}

extractNormalityTests <- function(demetra_m, thresholds = getOption("jdc_thresholds")) {
    tests_possibles <- grep("(^skewness$)|(^kurtosis$)|(^lb2$)", colnames(demetra_m))
    if (length(tests_possibles) < 3L) {
        stop(
            "At least one test is missing, among: skewness, kurtosis, lb2",
            "Re-compute the cruncher export with the options:",
            " residuals.skewness:3, residuals.kurtosis:3 and residuals.lb2:3"
        )
    } else if (length(tests_possibles) > 3L) {
        stop("There are several variables with the same name ",
             "(among skewness, kurtosis, lb2)")
    }

    if (is.character(demetra_m[["skewness"]])
        && is.character(demetra_m[["kurtosis"]])
        && is.character(demetra_m[["lb2"]])) {

        next_index <- rep(tests_possibles, each = 2L) + rep(seq_len(2L), 3L)
        next_colnames <- colnames(demetra_m)[next_index]

        if (length(grep(pattern = "^X\\.(\\d){1,}$", x = next_colnames, fixed = FALSE)) != 6L) {
            stop("Re-compute the cruncher export with the options:",
                 " residuals.skewness:3, residuals.kurtosis:3 and residuals.lb2:3")
        }
        skewness_pvalue <- demetra_m[, tests_possibles[[1L]] + 2L]
        kurtosis_pvalue <- demetra_m[, tests_possibles[[2L]] + 2L]
        homoskedasticity_pvalue <- demetra_m[, tests_possibles[[3L]] + 2L]

    } else if (is.numeric(demetra_m[["skewness"]])
               && is.numeric(demetra_m[["kurtosis"]])
               && is.numeric(demetra_m[["lb2"]])) {
        if (length(grep(pattern = "^X\\.(\\d){1,}$", x = colnames(demetra_m)[tests_possibles + 1L], fixed = FALSE)) != 3L) {
            stop("Re-compute the cruncher export with the options:",
                 " residuals.skewness:2, residuals.kurtosis:2 and residuals.lb2:2")
        }
        skewness_pvalue <- demetra_m[, tests_possibles[[1L]] + 1L]
        kurtosis_pvalue <- demetra_m[, tests_possibles[[2L]] + 1L]
        homoskedasticity_pvalue <- demetra_m[, tests_possibles[[3L]] + 1L]
    } else {
        stop("the matrix has wrong format.")
    }

    normality_modalities <- data.frame(
        residuals_homoskedasticity = cut(
            x = skewness_pvalue,
            breaks = c(-Inf, thresholds[["residuals_normality"]]),
            labels = names(thresholds[["residuals_normality"]]),
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        ),
        residuals_skewness = cut(
            x = skewness_pvalue,
            breaks = c(-Inf, thresholds[["residuals_skewness"]]),
            labels = names(thresholds[["residuals_skewness"]]),
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        ),
        residuals_kurtosis = cut(
            x = skewness_pvalue,
            breaks = c(-Inf, thresholds[["residuals_kurtosis"]]),
            labels = names(thresholds[["residuals_kurtosis"]]),
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        )
    )
    normality_values <- data.frame(
        residuals_homoskedasticity = homoskedasticity_pvalue,
        residuals_skewness = skewness_pvalue,
        residuals_kurtosis = kurtosis_pvalue
    )

    return(list(modalities = normality_modalities, values = normality_values))
}

extractOutliers <- function(demetra_m, thresholds = getOption("jdc_thresholds")) {

    if (all(c("n", "nout") %in% colnames(demetra_m))) {
        pct_outliers_value <- 100.0 * as.integer(demetra_m[["nout"]]) / as.integer(demetra_m[["n"]])
    } else {
        warning(
            "The following variables are missing from the diagnostics matrix:\n",
            toString(c("span.n", "regression.nout")),
            "\n\nPlease re-compute the export."
        )
        pct_outliers_value <- rep(NA_real_, nrow(demetra_m))
    }

    m7 <- as.numeric(demetra_m[["m7"]])

    outliers_modalities <- data.frame(
        m7 = cut(
            x = m7,
            breaks = c(-Inf, thresholds[["m7"]]),
            labels = names(thresholds[["m7"]]),
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        ),
        pct_outliers = cut(
            x = pct_outliers_value,
            breaks = c(-Inf, thresholds[["pct_outliers"]]),
            labels = names(thresholds[["pct_outliers"]]),
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        )
    )
    outliers_values <- data.frame(
        m7 = m7,
        pct_outliers = pct_outliers_value
    )

    return(list(modalities = outliers_modalities, values = outliers_values))
}

extractTest <- function(demetra_m, thresholds = getOption("jdc_thresholds")) {

    tests_variables <- c(
        "seas.sa.qs", "seas.sa.f",
        "seas.i.qs", "seas.i.f",
        "td.sa.last", "td.i.last",
        "independence", "normality"
    )

    present_variables <- intersect(tests_variables, colnames(demetra_m))
    index_present_variables <- match(present_variables, tests_variables)
    missing_variables <- setdiff(tests_variables, colnames(demetra_m))
    if (length(missing_variables) > 0L) {
        warning(
            "The following variables are missing from the diagnostics matrix:\n",
            toString(missing_variables),
            "\n\nPlease re-compute the export."
        )
    }

    index_variables <- match(present_variables, colnames(demetra_m)) + 1L

    test_values <- demetra_m[, index_variables, drop = FALSE]
    colnames(test_values) <- c(
        "qs_residual_sa_on_sa", "f_residual_sa_on_sa",
        "qs_residual_sa_on_i", "f_residual_sa_on_i",
        "f_residual_td_on_sa", "f_residual_td_on_i",
        "residuals_independency", "residuals_normality"
    )[index_present_variables]

    test_modalities <- as.data.frame(lapply(
        X = colnames(test_values),
        FUN = function(series_name) {
            cut(
                x = as.numeric(test_values[[series_name]]),
                breaks = c(-Inf, thresholds[[series_name]]),
                labels = names(thresholds[[series_name]]),
                right = FALSE,
                include.lowest = TRUE,
                ordered_result = TRUE
            )
        }
    ))
    colnames(test_modalities) <- colnames(test_values)

    return(list(modalities = test_modalities, values = test_values))
}
