#' Calculate descriptive statistics for National Forest Inventory Data
#' 
#' @description
#' summary_nfi() is a function that calculates comprehensive descriptive statistics for National Forest Inventory (NFI) data.
#' It can provide summaries for individual plots, the entire study area, or specific groups within the study area using parameters `byplot` or `plotgrp`.
#' Users have flexibility in specifying data inclusion criteria and analysis levels using parameters `clusterplot`, `largetreearea`, `stockedland`, and `talltree`.
#' These parameters determine whether to treat cluster plots as single plots, to include large tree survey plots, and to focus only on Stocked land and tall trees.
#' 
#' @details
#' The function calculates the following statistics:
#' Plot-related statistics:
#' - Number of cluster plots
#' - Number of subplots
#' - Number of subplots with large trees (\eqn{\geq} 30cm) observed
#' 
#' Tree-related statistics:
#' - Number of individual trees
#' - Number of large trees
#' - Number of dominant trees
#' - Number of tree species
#' 
#' Tree measurements and stand statistics (mean and standard deviation; note that the per-hectare standard deviations are returned in columns prefixed `se_`):
#' - DBH (Diameter at Breast Height)
#' - Tree height
#' - Height of dominant trees
#' - Number of trees per hectare
#' - Basal area per hectare
#' - Volume per hectare
#' 
#' The `largetreearea` parameter affects calculations differently:
#' - For per-hectare statistics (trees per hectare, basal area per hectare, volume per hectare), setting `largetreearea = TRUE` includes data from large tree survey plots.
#' - For all other statistics, trees from large tree survey plots are always excluded, regardless of the `largetreearea` setting.
#' 
#' @param data : A `list` containing 'plot' and 'tree' data frames, typically generated by \code{\link{read_nfi}}.
#' @param byplot : A logical flag (default FALSE); if TRUE, calculates statistics for each plot separately. If FALSE, calculates for the entire dataset.
#' @param plotgrp : A character vector; specifies variables from 'plot' table to use for grouping. Use \code{c()} to combine multiple variables.
#' @param continuousplot : A logical flag (default FALSE); if TRUE, includes only plots that have been measured at the exact same location across all NFI cycles (5th, 6th, etc.). If FALSE, includes all plots regardless of location changes or missing cycle measurements.
#' @param clusterplot : A logical flag (default FALSE); if TRUE, treats each cluster plot as a single unit. If FALSE, calculates for each subplot separately.
#' @param largetreearea : A logical flag (default TRUE); if TRUE, includes large tree survey plots in the analysis. If FALSE, only uses standard tree plots.
#' @param stockedland : A logical flag (default TRUE); if TRUE, includes only stocked land. If FALSE, includes all land types.
#' @param talltree : A logical flag (default TRUE); if TRUE, includes only tall trees. If FALSE, includes both trees and shrubs.
#' 
#' @return 
#' A `data.frame` that includes summary statistics. 
#' The structure depends on the input parameters:
#' - If `byplot = TRUE`, each row represents a plot.
#' - If `byplot = FALSE`, each row represents the entire dataset or a group specified by `plotgrp`
#'  
#' @examples
#' 
#' data("nfi_donghae")
#' 
#' # Basic usage
#' summary_stats <- summary_nfi(nfi_donghae, continuousplot = TRUE)
#' 
#' # Summarize by the group, including all land types
#' grouped_stats <- summary_nfi(nfi_donghae, plotgrp = "OWN_CD", 
#'                               stockedland = FALSE, continuousplot = TRUE)
#' 
#' # Summarize by individual plots, including both trees and shrubs 
#' plot_summaries <- summary_nfi(nfi_donghae, byplot = TRUE, talltree = FALSE)
#' 
#' @export 



summary_nfi <- function(data, byplot = FALSE, plotgrp = NULL, continuousplot = FALSE,
                        clusterplot = FALSE, largetreearea = TRUE, stockedland = TRUE,
                        talltree = TRUE) {

  ## Input validation -----------------------------------------------------------
  # `data` must carry both the 'plot' and the 'tree' tables.
  required_names <- c("plot", "tree")
  missing_dfs <- setdiff(required_names, names(data))
  if (length(missing_dfs) > 0) {
    stop("Missing required data frames in the list: ",
         paste(missing_dfs, collapse = ", "), call. = FALSE)
  }

  if (!is.null(plotgrp)) {
    if (!is.character(plotgrp)) {
      stop("param 'plotgrp' must be 'character'", call. = FALSE)
    }
    # Report only the names that are really missing, in a single message.
    unknown_grp <- plotgrp[!plotgrp %in% names(data$plot)]
    if (length(unknown_grp) > 0) {
      stop("param 'plotgrp': ", paste(unknown_grp, collapse = ", "),
           " is not a column name in the 'plot' data frame.", call. = FALSE)
    }
  }

  # `%in%` keeps the condition scalar even when several grouping variables are
  # supplied (a vector inside `&&` is an error since R 4.3).
  if (clusterplot && "FORTYP_SUB" %in% plotgrp) {
    stop("When the param 'clusterplot' is set to TRUE, param 'plotgrp' uses FORTYP_CLST (the forest type for the cluster plot) instead of FORTYP_SUB (the forest type for each subplot).", call. = FALSE)
  }


  ## Preprocessing --------------------------------------------------------------
  # Column that identifies one "plot" at the requested analysis level.
  plot_id_name <- if (clusterplot) "CLST_PLOT" else "SUB_PLOT"

  if (stockedland) {
    data <- filter_nfi(data, c("plot$LAND_USECD == '1'"))
  }

  if (talltree) {
    data$tree <- data$tree %>% filter(WDY_PLNTS_TYP_CD == "1")
  }

  # After this filter no tree with LARGEP_TREE != "0" is left, so the
  # per-hectare section below does not need to filter again.
  if (!largetreearea) {
    data$tree <- data$tree %>% filter(LARGEP_TREE == "0")
  }

  if (continuousplot) {
    # Drop plots whose SUBPTYP equals the escaped literal below, then keep only
    # the subplots that appear in every remaining cycle.
    data <- filter_nfi(data, 'plot$SUBPTYP != "\\uc704\\uce58\\ubcc0\\uacbd" | is.na(plot$SUBPTYP)')
    all_cycle <- unique(data$plot$CYCLE)
    samples_with_all_cycle <- data$plot %>%
      filter(!is.na(FORTYP_SUB)) %>%
      group_by(SUB_PLOT) %>%
      filter(all(all_cycle %in% CYCLE)) %>%
      distinct(SUB_PLOT) %>%
      pull(SUB_PLOT)

    data <- filter_nfi(data, c("plot$SUB_PLOT %in% samples_with_all_cycle"))
  }

  # Grouping variables as plain names (character(0) when `plotgrp` is NULL);
  # used for column selection and as join keys.
  plotgrp_names <- as.character(plotgrp)

  tree_cols <- c("CLST_PLOT", "SUB_PLOT", "CYCLE", "INVYR", "WDY_PLNTS_TYP_CD",
                 "CCL", "CCLCD", "SP", "DBH", "BASAL_AREA", "HT_EST", "VOL_EST",
                 "LARGEP_TREE")
  plot_cols <- c("CLST_PLOT", "SUB_PLOT", "CYCLE", "INVYR", "LAND_USE",
                 "LAND_USECD", "NONFR_INCL_AREA_LARGEP", "NONFR_INCL_AREA_SUBP",
                 plotgrp_names)

  df <- left_join(data$tree[, tree_cols], data$plot[, plot_cols],
                  by = c("CLST_PLOT", "SUB_PLOT", "CYCLE", "INVYR"))

  # Symbols for tidy evaluation inside group_by().
  plot_id <- rlang::sym(plot_id_name)
  plotgrp <- rlang::syms(plotgrp_names)

  # Join keys for the site-level and the plot-level result tables.
  site_keys <- c("CYCLE", plotgrp_names)
  plot_keys <- c("CYCLE", plot_id_name, "INVYR", plotgrp_names)

  # Area used as the per-hectare denominator: a fixed base (0.04 / 0.08) minus
  # the NONFR_INCL_AREA_* column scaled by 10 / 10000.
  df$tree_area <- 0.04 - ((df$NONFR_INCL_AREA_SUBP * 10) / 10000)
  df$largetree_area <- 0.08 - ((df$NONFR_INCL_AREA_LARGEP * 10) / 10000)

  # Flag trees with DBH >= 30; the flag is forced to 0 when large tree survey
  # plots are excluded.
  if (!largetreearea) {
    df$largetree <- 0
  } else {
    df$largetree <- ifelse(df$DBH >= 30, 1, 0)
  }

  # Trees with LARGEP_TREE == "0": the only ones used for counts and means.
  std_trees <- df %>% filter(LARGEP_TREE == "0")


  ## Counts and tree measurements -----------------------------------------------
  if (!byplot) {
    # Plot counts use every tree row, including large tree survey plots.
    stat_num_plot <- df %>%
      group_by(CYCLE, !!!plotgrp) %>%
      summarise(num_clusterplot = n_distinct(CLST_PLOT, na.rm = TRUE),
                num_subplot = n_distinct(SUB_PLOT, na.rm = TRUE),
                # na.rm: a missing DBH gives an NA flag, which would otherwise
                # be counted as one more subplot.
                num_dbh30_subplot = n_distinct(SUB_PLOT[largetree == 1], na.rm = TRUE),
                .groups = "drop")

    stat_num_tree <- std_trees %>%
      group_by(CYCLE, !!!plotgrp) %>%
      summarise(num_tree = n(),
                num_largetree = sum(largetree, na.rm = TRUE),
                num_dominanttree = sum(CCLCD == "a", na.rm = TRUE),
                num_species = n_distinct(SP),
                .groups = "drop")

    stat_num <- full_join(stat_num_plot, stat_num_tree, by = site_keys)

    # Two-stage summary: first one mean per plot, then mean and sd of those
    # plot means, so every plot carries the same weight.
    stat_mean <- std_trees %>%
      group_by(CYCLE, !!plot_id, !!!plotgrp) %>%
      summarise(mean_DBH_temp = mean(DBH, na.rm = TRUE),
                mean_H_temp = mean(HT_EST, na.rm = TRUE),
                mean_dominant_H_temp = mean(HT_EST[CCLCD == "a"], na.rm = TRUE),
                .groups = "drop") %>%
      group_by(CYCLE, !!!plotgrp) %>%
      summarise(mean_DBH = mean(mean_DBH_temp, na.rm = TRUE),
                sd_DBH = sd(mean_DBH_temp, na.rm = TRUE),
                mean_H = mean(mean_H_temp, na.rm = TRUE),
                sd_H = sd(mean_H_temp, na.rm = TRUE),
                mean_dominant_H = mean(mean_dominant_H_temp, na.rm = TRUE),
                sd_dominant_H = sd(mean_dominant_H_temp, na.rm = TRUE),
                .groups = "drop")

    stat_temp <- full_join(stat_num, stat_mean, by = site_keys)

  } else {
    # Per-plot counts and means share the same grouping, so a single pass
    # yields the same table as joining two separate summaries.
    stat_temp <- std_trees %>%
      group_by(CYCLE, !!plot_id, INVYR, !!!plotgrp) %>%
      summarise(num_tree = n(),
                num_largetree = sum(largetree, na.rm = TRUE),
                num_dominanttree = sum(CCLCD == "a", na.rm = TRUE),
                num_species = n_distinct(SP),
                mean_DBH = mean(DBH, na.rm = TRUE),
                mean_H = mean(HT_EST, na.rm = TRUE),
                mean_dominant_H = mean(HT_EST[CCLCD == "a"], na.rm = TRUE),
                .groups = "drop")
  }


  ## Per-hectare statistics -----------------------------------------------------
  # Step 1: raw totals per plot (split by the large-tree flag) together with
  # the areas they have to be divided by.
  if (clusterplot) {
    # One area record per subplot and cycle. A logical mask is used because
    # `-which(...)` would drop every row when nothing is duplicated.
    first_row <- !duplicated(df[c("SUB_PLOT", "CYCLE")])
    plot_area <- df[first_row, c("CYCLE", "INVYR", "CLST_PLOT", "SUB_PLOT",
                                 "largetree_area", "tree_area")]

    # Cluster area = sum of the areas of its subplots.
    plot_area <- plot_area %>%
      group_by(CYCLE, !!plot_id, INVYR) %>%
      summarise(largetree_area = sum(largetree_area, na.rm = TRUE),
                tree_area = sum(tree_area, na.rm = TRUE),
                .groups = "drop")

    stat_ha <- df %>%
      group_by(CYCLE, !!plot_id, INVYR, largetree, !!!plotgrp) %>%
      summarise(tree_temp = n(),
                basal_temp = sum(BASAL_AREA, na.rm = TRUE),
                volume_temp = sum(VOL_EST, na.rm = TRUE),
                .groups = "drop")

    stat_ha <- full_join(stat_ha, plot_area, by = c("CYCLE", "INVYR", plot_id_name))

  } else {
    # At subplot level the areas are constant within a plot, so they can be
    # carried along as grouping columns.
    stat_ha <- df %>%
      group_by(CYCLE, !!plot_id, INVYR, largetree, largetree_area, tree_area, !!!plotgrp) %>%
      summarise(tree_temp = n(),
                basal_temp = sum(BASAL_AREA, na.rm = TRUE),
                volume_temp = sum(VOL_EST, na.rm = TRUE),
                .groups = "drop")
  }

  # Step 2: convert the totals to per-hectare values (same for both levels).
  if (!largetreearea) {
    # Only standard plots are present: divide everything by `tree_area`.
    stat_ha <- as.data.frame(stat_ha)
    stat_ha$tree_n_ha <- stat_ha$tree_temp / stat_ha$tree_area
    stat_ha$basal_m2_ha <- stat_ha$basal_temp / stat_ha$tree_area
    stat_ha$volume_m3_ha <- stat_ha$volume_temp / stat_ha$tree_area

    helper_cols <- c("tree_temp", "basal_temp", "volume_temp",
                     "tree_area", "largetree_area", "largetree")
    stat_ha <- stat_ha[setdiff(names(stat_ha), helper_cols)]

  } else {
    # Flagged trees are expanded with `largetree_area`, all others with
    # `tree_area`; the two parts are then added up per plot.
    stat_ha$area_ha <- ifelse(stat_ha$largetree == 1,
                              stat_ha$largetree_area,
                              stat_ha$tree_area)

    stat_ha <- stat_ha %>%
      group_by(CYCLE, INVYR, !!plot_id, !!!plotgrp) %>%
      summarise(tree_n_ha = sum(tree_temp / area_ha, na.rm = TRUE),
                basal_m2_ha = sum(basal_temp / area_ha, na.rm = TRUE),
                volume_m3_ha = sum(volume_temp / area_ha, na.rm = TRUE),
                .groups = "drop")
  }


  ## Combine both parts ---------------------------------------------------------
  if (!byplot) {
    # NOTE: the `se_*` columns contain standard deviations (sd()), not standard
    # errors; the names are kept so existing callers keep working.
    stat_ha <- stat_ha %>%
      group_by(CYCLE, !!!plotgrp) %>%
      summarise(mean_tree_n_ha = mean(tree_n_ha, na.rm = TRUE),
                se_tree_n_ha = sd(tree_n_ha, na.rm = TRUE),
                mean_basal_m2_ha = mean(basal_m2_ha, na.rm = TRUE),
                se_basal_m2_ha = sd(basal_m2_ha, na.rm = TRUE),
                mean_volume_m3_ha = mean(volume_m3_ha, na.rm = TRUE),
                se_volume_m3_ha = sd(volume_m3_ha, na.rm = TRUE),
                .groups = "drop")

    stat_data <- full_join(stat_temp, stat_ha, by = site_keys)

  } else {
    stat_data <- full_join(stat_temp, stat_ha, by = plot_keys)
  }

  stat_data
}