## Copyright (C) 2026 Andreas Bertsatos <abertsatos@biol.uoa.gr>
## Copyright (C) 2026 Avanish Salunke <avanishsalunke16@gmail.com>
##
## This file is part of the statistics package for GNU Octave.
##
## This program is free software; you can redistribute it and/or modify it under
## the terms of the GNU General Public License as published by the Free Software
## Foundation; either version 3 of the License, or (at your option) any later
## version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn  {statistics} {@var{terms} =} parseWilkinsonFormula (@var{formula})
## @deftypefnx {statistics} {@var{result} =} parseWilkinsonFormula (@var{formula}, @var{mode})
## @deftypefnx {statistics} {[@var{X}, @var{y}, @var{names}] =} parseWilkinsonFormula (@var{formula}, "model_matrix", @var{data})
##
## Parse and expand statistical model formulae using the Wilkinson notation.
##
## This function implements the recursive-descent parser and expansion logic
## described by Wilkinson & Rogers (1973) for factorial models.  It allows the
## symbolic specification of analysis of variance and regression models,
## converting strings into computational schemas or design matrices.  It also
## supports multi-variable response specification on the Left-Hand Side (LHS)
## using lists or ranges.
##
## @code{parseWilkinsonFormula} accepts as its first input argument a Wilkinson
## notation string specified by @var{formula} either as a character vector or a
## string scalar with the following list of valid symbols:
##
## @itemize
## @item @strong{LHS (Response) Operators:}
##
## @itemize
## @item @code{,} : List separator for selecting multiple responses.
##
## @item @code{-} : Range operator for selecting multiple responses.
## @end itemize
##
## @item @strong{RHS (Model) Operators:}
##
## @itemize
## @item @code{+} : Term addition (Union of terms).
##
## @item @code{-} : Term deletion (Difference of terms).
##
## @item @code{*} : Crossing (Expands to Main Effects + Interaction).
##
## @item @code{/} : Nesting (Hierarchical relationship).
##
## @item @code{:} : Direct interaction.
##
## @item @code{^} : Crossing expansion limit.
## @end itemize
## @end itemize
##
## @code{parseWilkinsonFormula (@var{formula}, @var{mode})} further specifies
## how to process the Wilkinson notation specified by @var{formula}.  @var{mode}
## must be a character vector or a string scalar with any of the following
## acceptable values.
##
## @itemize
## @item @qcode{'expand'} (default) : Returns a cell array of character vectors
## containing the expanded model terms (e.g., @code{@{"A", "B", "A:B"@}}).
##
## @item @qcode{'matrix'} : Returns a schema structure containing a binary
## matrix defining term membership.
##
## @item @qcode{'model_matrix'} : Constructs the full Design Matrix (@var{X})
## and Response Matrix (@var{y}) based on the provided @var{data}.  Uses
## corner-point (reference) coding for categoricals.
##
## @item @qcode{'parse'} : Returns the raw Abstract Syntax Tree (AST).
##
## @item @qcode{'tokenize'} : Returns the list of tokens generated by the lexer
## (useful only for debugging).
## @end itemize
##
## @code{[@var{X}, @var{y}, @var{names}] = parseWilkinsonFormula @
## (@var{formula}, "model_matrix", @var{data})} will also accept a structure or
## a table containing the data variables.  Required only when @var{mode} is
## @code{"model_matrix"}.
## @itemize
## @item Field names must match variables in the formula.
## @item Response variables (LHS) must be numeric.
## @item Rows containing @code{NaN} are automatically removed.
## @end itemize
##
## @strong{Outputs}
## @table @var
## @item terms/result
## The processed model structure depending on the selected @var{mode}.
## @item X
## The numeric design matrix (observations x parameters).
## @item y
## The response matrix (observations x K).
## @item names
## A cell array of column names corresponding to @var{X}.
## @end table
##
## @strong{References}
##
## Wilkinson, G. N. and Rogers, C. E. (1973). Symbolic Description of Factorial
## Models for Analysis of Variance. Applied Statistics, 22, 392-399.
##
## @end deftypefn

function varargout = parseWilkinsonFormula (varargin)

  ## Entry point.  Dispatches on MODE:
  ##   "tokenize"     -> token list from the lexer
  ##   "parse"        -> abstract syntax tree
  ##   "expand"       -> expanded term list
  ##   "matrix"       -> schema structure built from the expanded terms
  ##   "model_matrix" -> [X, y, names] compiled against the supplied data
  ##
  ## varargin{1} is the formula, varargin{2} the optional mode (defaults to
  ## "expand") and varargin{3} the data, required only for "model_matrix".

  if (nargin < 1)
    error ("parseWilkinsonFormula: Input formula string is required.");
  elseif (nargin > 3)
    error ("parseWilkinsonFormula: Too many input arguments.");
  endif

  formula_str = varargin{1};
  if (nargin > 1)
    mode = varargin{2};
  else
    mode = "expand";
  endif

  mode = lower (mode);

  if (strcmp (mode, "tokenize"))
    varargout{1} = run_lexer (formula_str);
    return;
  endif

  if (! strcmp (mode, "model_matrix"))
    tokens = run_lexer (formula_str);
    [tree, curr] = run_parser (tokens);

    ## A '~' directly below the top-level '~' means the formula has more
    ## than one separator.
    if (is_tilde_node (tree) &&
        (is_tilde_node (tree.left) || is_tilde_node (tree.right)))
      error ("parseWilkinsonFormula: Unexpected token");
    endif

    ## The parser must have consumed everything up to the EOF token.
    reject_trailing_tokens (tokens, curr);
  else
    ## "model_matrix" parses the right-hand side on its own further below.
    tree = [];
  endif

  ## Mode specific Processing
  switch (mode)
    case "parse"
      varargout{1} = tree;

    case "expand"
      varargout{1} = run_expander (tree);

    case "matrix"
      expanded = run_expander (tree);
      varargout{1} = run_schema_builder (expanded);

    case "model_matrix"
      ## Compile with Data
      if (nargin < 3)
        error (strcat ("parseWilkinsonFormula: 'model_matrix'", ...
                       " mode requires a Data Table."));
      endif
      data_table = varargin{3};

      ## The LHS uses list/range syntax that the expression parser does not
      ## understand, so the formula is split manually at the first '~'.
      tilde_idx = strfind (formula_str, "~");
      has_tilde = ! isempty (tilde_idx);

      if (has_tilde)
        lhs_str = strtrim (formula_str(1:tilde_idx(1)-1));
        rhs_str = strtrim (formula_str(tilde_idx(1)+1:end));
      else
        rhs_str = formula_str;
      endif

      ## Parse the RHS and apply the same sanity checks as the other modes:
      ## no second separator and no unconsumed trailing tokens.
      rhs_tokens = run_lexer (rhs_str);
      [rhs_tree, rhs_curr] = run_parser (rhs_tokens);

      if (is_tilde_node (rhs_tree))
        error ("parseWilkinsonFormula: Unexpected token");
      endif
      reject_trailing_tokens (rhs_tokens, rhs_curr);

      ## Wrap the RHS in a one-sided '~' node so that the expander applies
      ## its implicit-intercept logic.
      wrapper.type = "OPERATOR";
      wrapper.value = "~";
      wrapper.left = [];
      wrapper.right = rhs_tree;

      expanded = run_expander (wrapper);
      schema = run_schema_builder (expanded);

      ## Resolve LHS variables manually.
      if (has_tilde)
        schema.ResponseVars = resolve_lhs_vars (lhs_str, data_table);
      else
        schema.ResponseVars = {};
      endif

      [X, y, names] = run_model_matrix_builder (schema, data_table);

      varargout{1} = X;
      if (nargout > 1), varargout{2} = y; endif
      if (nargout > 2), varargout{3} = names; endif

    otherwise
      error ("parseWilkinsonFormula: Unknown mode: %s", mode);
  endswitch

endfunction

## Return true when NODE is a non-empty AST struct for the '~' operator.
function tf = is_tilde_node (node)
  tf = (! isempty (node) && isstruct (node) ...
        && strcmp (node.type, "OPERATOR") && strcmp (node.value, "~"));
endfunction

## Raise an error when the parser stopped before reaching the EOF token.
function reject_trailing_tokens (tokens, curr)
  if (curr <= length (tokens) && ! strcmp (tokens(curr).type, "EOF"))
    error ("parseWilkinsonFormula: Unexpected token");
  endif
endfunction

## lexer
function tokens = run_lexer (formula_str)

  ## Split FORMULA_STR into a struct array of tokens with the fields
  ## "type", "value" and "pos" (1-based index of the token's first
  ## character).  Whitespace is skipped.  A final "EOF" token is appended,
  ## except for empty input, which yields an empty token array.
  ##
  ## Raises an error on any character that is not part of an identifier,
  ## a number, or one of the recognised operator/punctuation symbols.

  if (isempty (formula_str))
    tokens = struct ("type", {}, "value", {}, "pos", {});
    return;
  endif

  str = char (formula_str);
  n = length (str);
  ## Every token consumes at least one character, so N slots always suffice;
  ## the unused tail is trimmed after the scan.
  tokens(n) = struct ("type", "", "value", "", "pos", 0);
  tok_idx = 0;
  i = 1;

  while (i <= n)
    c = str(i);
    start_pos = i;

    if (isspace (c))
      i = i + 1;
      continue;
    endif

    if (c == ',')
      tok_idx = tok_idx + 1;
      tokens(tok_idx) = create_token ("COMMA", ",", start_pos);
      i = i + 1;
      continue;
    endif

    ## Identifiers (Factors): a letter followed by letters, digits or '_'.
    if (isletter (c))
      i = i + 1;
      while (i <= n)
        next_c = str(i);
        if (isletter (next_c) || (next_c >= '0' && next_c <= '9') ...
            || next_c == '_')
          i = i + 1;
        else
          break;
        endif
      endwhile
      tok_idx = tok_idx + 1;
      tokens(tok_idx) = create_token ("IDENTIFIER", str(start_pos:i-1), ...
                                      start_pos);
      continue;
    endif

    ## Numbers: digits, with a '.' accepted only when a digit follows it, so
    ## that a '.' used as the interaction operator is left for the next token.
    if (c >= '0' && c <= '9')
      i = i + 1;
      while (i <= n)
        next_c = str(i);
        if (next_c >= '0' && next_c <= '9')
          i = i + 1;
        elseif (next_c == '.' && i < n && str(i+1) >= '0' && str(i+1) <= '9')
          i = i + 1;
        else
          break;
        endif
      endwhile
      tok_idx = tok_idx + 1;
      tokens(tok_idx) = create_token ("NUMBER", str(start_pos:i-1), start_pos);
      continue;
    endif

    ## Operators
    type = "";
    val = c;
    skip = 0;   # extra characters consumed by a two-character operator

    switch (c)
      case '-'
        type = "OP_MINUS";
        if (i < n)
          if (str(i+1) == '/')
            type = "OP_MINUS_MARGIN"; val = "-/"; skip = 1;
          elseif (str(i+1) == '*')
            type = "OP_MINUS_CLEAN"; val = "-*"; skip = 1;
          endif
        endif
      case '*'
        type = "OP_CROSS";
      case '^'
        type = "OP_POWER";
      case '/'
        type = "OP_NEST";
      case '+'
        type = "OP_PLUS";
      case {'.', ':'}
        type = "OP_DOT";
      case '~'
        type = "SEPARATOR";
      case '('
        type = "LPAREN";
      case ')'
        type = "RPAREN";
      otherwise
        ## The format arguments belong to error(), not to strcat().
        error (strcat ("parseWilkinsonFormula: Unexpected", ...
                       " character '%s' at position %d"), c, i);
    endswitch

    tok_idx = tok_idx + 1;
    tokens(tok_idx) = create_token (type, val, start_pos);
    i = i + 1 + skip;
  endwhile

  tokens = tokens (1:tok_idx);
  tokens(end+1) = create_token ("EOF", "EOF", i);

endfunction

function t = create_token (type, val, pos)
  ## Bundle a token's type, text and position into a scalar struct with
  ## the fields "type", "value" and "pos" (in that order).  Each argument is
  ## wrapped in a cell so that struct() stores it verbatim.
  t = struct ("type", {type}, "value", {val}, "pos", {pos});
endfunction

function [tree, curr] = run_parser (tokens, curr, prec_limit)

  ## Precedence-climbing parser over the token array TOKENS.
  ##
  ## CURR is the index of the next token to read (default 1) and PREC_LIMIT
  ## the precedence a binary operator must exceed to be absorbed by this
  ## call (default 0).  Returns the parsed subtree and the index of the
  ## first token that was not consumed.

  if (nargin < 3)
    prec_limit = 0;
    if (nargin < 2)
      curr = 1;
    endif
  endif

  n_tok = length (tokens);
  if (curr > n_tok)
    error ("parseWilkinsonFormula: Unexpected End Of Formula.");
  endif

  tok = tokens(curr);
  curr += 1;

  ## Primary expression
  switch (tok.type)
    case "IDENTIFIER"
      is_call = (curr <= n_tok && strcmp (tokens(curr).type, "LPAREN"));

      if (! is_call)
        ## Plain variable reference.
        tree = struct ("type", {tok.type}, "value", {tok.value}, ...
                       "left", {[]}, "right", {[]});
      else
        ## Function call: name, '(', comma-separated arguments, ')'.
        func_name = tok.value;
        curr += 1;
        args = {};

        if (curr <= n_tok && ! strcmp (tokens(curr).type, "RPAREN"))
          more_args = true;
          while (more_args)
            [arg_node, curr] = run_parser (tokens, curr, 0);
            args{end+1} = arg_node;

            more_args = (curr <= n_tok ...
                         && strcmp (tokens(curr).type, "COMMA"));
            if (more_args)
              curr += 1;
            endif
          endwhile
        endif

        if (! (curr <= n_tok && strcmp (tokens(curr).type, "RPAREN")))
          error (strcat ("parseWilkinsonFormula: Missing ')'", ...
                         " for function call '%s'."), func_name);
        endif
        curr += 1;

        tree = struct ("type", "FUNCTION", "name", {func_name}, ...
                       "args", {args}, "left", {[]}, "right", {[]});
      endif

    case "NUMBER"
      tree = struct ("type", {tok.type}, "value", {tok.value}, ...
                     "left", {[]}, "right", {[]});

    case "SEPARATOR"
      ## Unary '~': no left operand; the right operand is parsed with the
      ## separator's own precedence (5).
      [rhs_node, curr] = run_parser (tokens, curr, 5);
      tree = struct ("type", "OPERATOR", "value", {tok.value}, ...
                     "left", {[]}, "right", {rhs_node});

    case "LPAREN"
      [tree, curr] = run_parser (tokens, curr, 0);
      if (! (curr <= n_tok && strcmp (tokens(curr).type, "RPAREN")))
        error ("parseWilkinsonFormula: Mismatched Parentheses. Missing ')'.");
      endif
      curr += 1;

    case "EOF"
      error ("parseWilkinsonFormula: Unexpected End Of Formula.");

    otherwise
      error ("parseWilkinsonFormula: Syntax Error. Unexpected token: '%s'", tok.value);
  endswitch

  ## Binary operators: keep folding while the next operator binds tighter
  ## than PREC_LIMIT.
  while (curr <= n_tok)
    op_tok = tokens(curr);

    ## Binding strength; 0 marks "not a binary operator".
    switch (op_tok.type)
      case "OP_POWER"
        level = 60;
      case "OP_DOT"
        level = 50;
      case "OP_NEST"
        level = 40;
      case "OP_CROSS"
        level = 30;
      case "OP_PLUS"
        level = 20;
      case "SEPARATOR"
        level = 5;
      otherwise
        ## Any token type beginning with "OP_MINUS" is a deletion operator.
        if (strncmp (op_tok.type, "OP_MINUS", 8))
          level = 10;
        else
          level = 0;
        endif
    endswitch

    if (level == 0 || level <= prec_limit)
      break;
    endif

    curr += 1;
    [rhs_node, curr] = run_parser (tokens, curr, level);

    tree = struct ("type", "OPERATOR", "value", {op_tok.value}, ...
                   "left", {tree}, "right", {rhs_node});
  endwhile

endfunction

## expander
function result = run_expander (node)

  ## Recursively expand the AST rooted at NODE into model terms.
  ##
  ## For every node except '~' the result is a cell array of terms, each
  ## term being a cell array of factor names; the empty term {} stands for
  ## the intercept.  For a '~' node the result is a struct with the fields
  ## "response" (expansion of the left side) and "model" (expansion of the
  ## right side, with the intercept term prepended unless the right side is
  ## a deletion whose right operand is the number 1).
  ##
  ## Raises an error for unknown operators, for invalid exponents, and for
  ## node types other than IDENTIFIER, NUMBER and OPERATOR.

  if (isempty (node))
    result = {};
    return;
  endif

  ## Terminals
  if (strcmp (node.type, "IDENTIFIER"))
    result = {{node.value}};
    return;
  elseif (strcmp (node.type, "NUMBER"))
    ## "1" implies intercept term.
    if (strcmp (node.value, "1"))
      result = {{}};
    else
      result = {{node.value}};
    endif
    return;
  endif

  if (strcmp (node.type, "OPERATOR"))
    ## Formula Separator
    if (strcmp (node.value, "~"))
      result.response = run_expander (node.left);

      ## Implicit Intercept Logic: the intercept is added unless the
      ## top-level RHS operator is a deletion of the number 1.
      add_intercept = true;
      if (! isempty (node.right) && strcmp (node.right.type, "OPERATOR") ...
          && (strcmp (node.right.value, "-") ...
          || strcmp (node.right.value, "-/") ...
          || strcmp (node.right.value, "-*")))

        if (! isempty (node.right.right) ...
            && strcmp (node.right.right.type, "NUMBER") ...
            && strcmp (node.right.right.value, "1"))
          add_intercept = false;
        endif
      endif

      model_raw = run_expander (node.right);

      if (add_intercept)
        result.model = list_union ({{}}, model_raw);
      else
        result.model = model_raw;
      endif
      return;
    endif

    lhs = run_expander (node.left);
    rhs = run_expander (node.right);

    switch (node.value)
      case "+"
        result = list_union (lhs, rhs);

      case {".", ":"}
        result = list_product (lhs, rhs);

      case "*"
        interaction = list_product (lhs, rhs);
        step1 = list_union (lhs, rhs);
        result = list_union (step1, interaction);

      case "^"
        if (! strcmp (node.right.type, "NUMBER"))
          error ("parseWilkinsonFormula: Exponent must be a number.");
        endif

        ## Only whole exponents >= 1 are meaningful for repeated crossing;
        ## anything else used to be silently treated as a nearby integer.
        power_val = str2double (node.right.value);
        if (! (power_val >= 1 && power_val == fix (power_val)))
          error ("parseWilkinsonFormula: Exponent must be a positive integer.");
        endif

        ## Repeatedly apply Crossing, reusing the already expanded base.
        result = lhs;
        for k = 2:power_val
          interaction = list_product (result, lhs);
          step1 = list_union (result, lhs);
          result = list_union (step1, interaction);
        endfor

      case "/"
        ## Nesting: RHS terms are crossed with the maximal LHS terms only.
        max_L = get_maximal_terms (lhs);
        interaction = list_product (max_L, rhs);
        result = list_union (lhs, interaction);

      case "-"  ## Simple Deletion
        result = list_difference (lhs, rhs, "exact");

      case "-*" ## Delete term + Higher Order Interactions
        result = list_difference (lhs, rhs, "clean");

      case "-/" ## Delete terms where T is marginal
        result = list_difference (lhs, rhs, "margin");

      otherwise
        error ("parseWilkinsonFormula: Unknown operator '%s'", node.value);
    endswitch
    return;
  endif

  error ("parseWilkinsonFormula: Corrupt Tree.");

endfunction

## set operations.
function C = list_union (A, B)
  ## Concatenate the term lists A and B, then let simplify_term_list
  ## reduce the combined list.
  C = simplify_term_list ([A, B]);
endfunction

function C = list_product (A, B)
  ## Pair every term of A with every term of B (A varies slowest) and merge
  ## each pair's factor sets with union(); the resulting list is then passed
  ## through simplify_term_list.
  n_a = length (A);
  n_b = length (B);
  C = cell (1, n_a * n_b);
  for a = 1:n_a
    for b = 1:n_b
      C{(a - 1) * n_b + b} = union (A{a}, B{b});
    endfor
  endfor
  C = simplify_term_list (C);
endfunction

function C = list_difference (S, T, mode)
  ## Remove from the term list S every term matched by some term of T.
  ## MODE selects what counts as a match between a term s of S and t of T:
  ##   "exact"  : s and t have the same signature string
  ##   "clean"  : same signature, or t is a subset of s
  ##   "margin" : different signature and t is a subset of s
  ## Any other MODE matches nothing.  An empty S yields {}, an empty T
  ## returns S unchanged.
  if (isempty (S))
    C = {};
    return;
  elseif (isempty (T))
    C = S;
    return;
  endif

  sig_S = terms_to_strings (S);
  sig_T = terms_to_strings (T);
  n_T = length (T);

  drop = false (size (S));

  for a = 1:length (S)
    b = 0;
    ## Stop scanning T as soon as one of its terms matches S{a}.
    while (! drop(a) && b < n_T)
      b++;
      same_sig = strcmp (sig_S{a}, sig_T{b});

      switch (mode)
        case "exact"
          drop(a) = same_sig;

        case "clean"
          drop(a) = same_sig || is_subset (T{b}, S{a});

        case "margin"
          drop(a) = ! same_sig && is_subset (T{b}, S{a});
      endswitch
    endwhile
  endfor

  C = S(! drop);
endfunction

function fac = get_fac (term_list)
  ## Fold union() over every term of TERM_LIST and return the combined
  ## factor set wrapped in a one-element cell (i.e. as a single term).
  merged = {};
  pos = 0;
  while (pos < length (term_list))
    pos++;
    merged = union (merged, term_list{pos});
  endwhile
  fac = {merged};
endfunction

function is_sub = is_subset (small_set, large_set)
  ## True when no element of SMALL_SET is missing from LARGE_SET.
  missing = ! ismember (small_set, large_set);
  is_sub = ! any (missing);
endfunction

function clean_list = simplify_term_list (raw_list)
  ## Keep one term per distinct signature string (as computed by
  ## terms_to_strings).  The representatives are the ones indexed by
  ## unique(), re-sorted into their order within RAW_LIST.
  clean_list = {};
  if (! isempty (raw_list))
    signatures = terms_to_strings (raw_list);
    [~, picked] = unique (signatures);
    in_list_order = sort (picked);
    clean_list = raw_list(in_list_order);
  endif
endfunction

function strs = terms_to_strings (term_list)
  ## Build one signature string per term: the term's factors sorted and
  ## joined with ":", or "1" for an empty term.  The output cell has the
  ## same size as TERM_LIST.
  strs = cell (size (term_list));
  n_terms = length (term_list);
  pos = 0;
  while (pos < n_terms)
    pos++;
    factors = term_list{pos};
    if (! isempty (factors))
      strs{pos} = strjoin (sort (factors), ":");
      continue;
    endif
    strs{pos} = "1";
  endwhile
endfunction

## schema builder
function schema = run_schema_builder (expanded)

  ## Turn an expansion result into a schema struct with the fields
  ##   VariableNames : unique variable names from both sides, without "1"
  ##   ResponseIdx   : position of the first response name in VariableNames
  ##                   ([] when there is none)
  ##   Terms         : binary terms-by-variables matrix, duplicate rows
  ##                   removed, ordered by row sum and then by row content
  ##
  ## EXPANDED is either a struct (fields "model"/"response", falling back to
  ## "rhs"/"lhs") or a plain cell array of right-hand-side terms.

  ## Split the input into right-hand-side terms and the response part.
  if (! isstruct (expanded))
    rhs_terms = expanded;
    lhs_term = {};
  else
    if (isfield (expanded, "model"))
      rhs_terms = expanded.model;
    else
      rhs_terms = expanded.rhs;
    endif
    if (isfield (expanded, "response"))
      lhs_term = expanded.response;
    else
      lhs_term = expanded.lhs;
    endif
  endif

  ## Collect, depth first, every text leaf of a (possibly nested) cell array
  ## into a flat cell row of character vectors; other inputs yield {}.
  function out = flatten_recursive (in_val)
    if (ischar (in_val) || isstring (in_val))
      out = {char(in_val)};
      return;
    endif
    out = {};
    if (iscell (in_val))
      for k = 1:numel (in_val)
        out = [out, flatten_recursive(in_val{k})];
      endfor
    endif
  endfunction

  ## Names on the response side (left empty when there is no LHS).
  response_names = {};
  if (! isempty (lhs_term))
    response_names = flatten_recursive (lhs_term);
  endif
  all_vars = [{}, response_names];

  ## Per-term variable lists; ":"-joined names are split into single ones.
  n_rhs = length (rhs_terms);
  cleaned_rhs = cell (n_rhs, 1);
  for t = 1:n_rhs
    leaves = flatten_recursive (rhs_terms{t});
    pieces = {};
    for q = 1:length (leaves)
      pieces = [pieces, strsplit(leaves{q}, ":")];
    endfor
    cleaned_rhs{t} = pieces;
    all_vars = [all_vars, pieces];
  endfor

  ## The intercept marker "1" is not a variable.
  all_vars = unique (all_vars);
  all_vars(strcmp (all_vars, "1")) = [];

  schema.VariableNames = all_vars;

  ## Only the first response name is located here.
  schema.ResponseIdx = [];
  if (! isempty (response_names))
    [is_known, where] = ismember (response_names{1}, all_vars);
    if (is_known)
      schema.ResponseIdx = where;
    endif
  endif

  ## One row per term, one column per variable.
  n_vars = length (all_vars);
  n_terms = length (cleaned_rhs);
  terms_mat = zeros (n_terms, n_vars);

  for t = 1:n_terms
    members = cleaned_rhs{t};

    ## An intercept term keeps its all-zero row.
    is_intercept = (isempty (members) ||
                    (length (members) == 1 && strcmp (members{1}, "1")));
    if (! is_intercept)
      [is_known, where] = ismember (members, all_vars);
      if (! all (is_known))
        error ("parseWilkinsonFormula: Unknown variable in term definition.");
      endif
      terms_mat(t, where) = 1;
    endif
  endfor

  ## Drop duplicate rows, then order rows by term order and content.
  [~, keep_rows] = unique ([sum(terms_mat, 2), terms_mat], "rows");
  terms_mat = terms_mat (keep_rows, :);

  [~, row_order] = sortrows ([sum(terms_mat, 2), terms_mat]);
  schema.Terms = terms_mat (row_order, :);

endfunction

## model matrix builder.
function [X, y, col_names] = run_model_matrix_builder (schema, data)

  ## Build the design matrix X, the response matrix y and the column names
  ## of X from SCHEMA and DATA.
  ##
  ## SCHEMA supplies "VariableNames", "ResponseIdx", "Terms" and, optionally,
  ## "ResponseVars".  DATA is a table or a struct whose fields hold the
  ## variables.  Rows in which any numeric formula or response variable is
  ## NaN are removed.  Non-numeric predictors are dummy coded, dropping the
  ## first level when the model contains an intercept term.
  ##
  ## Raises an error when a formula variable is missing from DATA or when a
  ## response variable is not numeric.

  req_vars = schema.VariableNames;

  is_table_input = isa (data, "table");

  if (is_table_input)
    ## table:
    get_var_names = @() data.Properties.VariableNames;
    check_has_var = @(name) ismember (name, data.Properties.VariableNames);
  else
    ## struct:
    get_var_names = @() fieldnames (data);
    check_has_var = @(name) isfield (data, name);
  endif

  ## Fail early, and by name, on ANY formula variable missing from the data
  ## (not just the first one), before any column is accessed.
  for i = 1:length (req_vars)
    if (! check_has_var (req_vars{i}))
      error ("parseWilkinsonFormula: Unknown variable '%s' in Data Table.", ...
             req_vars{i});
    endif
  endfor

  ## Data validation & masking
  if (isempty (req_vars))
    ## No variables in the formula: size the mask from the first data column.
    fnames = get_var_names ();
    n_total = length (data.(fnames{1}));
    valid_mask = true (n_total, 1);
  else
    n_total = length (data.(req_vars{1}));
    valid_mask = true (n_total, 1);
    for i = 1:length (req_vars)
      col = data.(req_vars{i});
      if (isnumeric (col))
        valid_mask = valid_mask & ! isnan (col);
      endif
    endfor
  endif

  if (! isempty (schema.ResponseIdx))
    y_name = req_vars{schema.ResponseIdx};
    if (! check_has_var (y_name))
      error ("parseWilkinsonFormula: Unknown variable '%s' in Data Table.", ...
             y_name);
    endif

    y_col = data.(y_name);
    if (isnumeric (y_col))
      valid_mask = valid_mask & ! isnan (y_col);
    endif
  endif

  if (isfield (schema, "ResponseVars") && ! isempty (schema.ResponseVars))
    for k = 1:length (schema.ResponseVars)
      y_name = schema.ResponseVars{k};
      if (check_has_var (y_name))
        col = data.(y_name);
        if (isnumeric (col))
          valid_mask = valid_mask & ! isnan (col);
        endif
      endif
    endfor
  endif

  n_rows = sum (valid_mask);

  ## Process predictors: numeric columns are stored as-is, everything else
  ## is reduced to its unique levels plus a per-row level index.
  var_info = struct ();
  for i = 1:length (req_vars)
    vname = req_vars{i};
    raw = data.(vname);

    if (iscell (raw)), raw = raw(valid_mask);
    else, raw = raw(valid_mask, :); endif

    if (isnumeric (raw))
      var_info.(vname).type = "numeric";
      var_info.(vname).data = raw;
    else
      if (! iscellstr (raw) && ! isstring (raw))
        raw = cellstr (raw);
      endif
      [u, ~, idx] = unique (raw);
      var_info.(vname).type = "categorical";
      var_info.(vname).levels = u;
      var_info.(vname).indices = idx;
    endif
  endfor

  ## Build Design Matrix X
  X = [];
  col_names = {};

  ## Check for intercept term (an all-zero row of the terms matrix).
  intercept_row_idx = find (sum (schema.Terms, 2) == 0);
  has_intercept = ! isempty (intercept_row_idx);

  n_terms = size (schema.Terms, 1);

  for i = 1:n_terms
    term_row = schema.Terms(i, :);
    vars_idx = find (term_row);

    ## Intercept Term
    if (isempty (vars_idx))
      X = [X, ones(n_rows, 1)];
      col_names = [col_names; "(Intercept)"];
      continue;
    endif

    ## Each term starts as a column of ones and is multiplied up, variable
    ## by variable.
    current_block = ones (n_rows, 1);
    current_names = {""};

    for v = vars_idx
      vname = req_vars{v};
      info = var_info.(vname);

      if (strcmp (info.type, "numeric"))
        current_block = current_block .* info.data;
        for k = 1:length (current_names)
          if (isempty (current_names{k}))
            current_names{k} = vname;
          else
            current_names{k} = [current_names{k}, ":", vname];
          endif
        endfor
      else
        ## Categorical
        n_lev = length (info.levels);

        ## Drop first level if intercept exists to avoid rank deficiency
        if (has_intercept)
          start_lev = 2;
          n_cols = n_lev - 1;
        else
          start_lev = 1;
          n_cols = n_lev;
        endif

        dummies = zeros (n_rows, n_cols);
        dum_names = {};

        for L = start_lev:n_lev
          col_idx = L - start_lev + 1;
          dummies(:, col_idx) = (info.indices == L);
          dum_names = [dum_names; ...
                       sprintf("%s_%s", vname, char (info.levels{L}))];
        endfor

        ## Cartesian product of current block and new dummies
        next_block = [];
        next_names = {};

        for c1 = 1:size (current_block, 2)
          for c2 = 1:size (dummies, 2)
            next_block = [next_block, current_block(:, c1) .* dummies(:, c2)];

            n1 = current_names{c1};
            n2 = dum_names{c2};

            if (isempty (n1))
              next_names = [next_names; n2];
            else
              next_names = [next_names; [n1, ":", n2]];
            endif
          endfor
        endfor
        current_block = next_block;
        current_names = next_names;
      endif
    endfor

    X = [X, current_block];
    col_names = [col_names; current_names];
  endfor

  ## Extract Response
  y = [];
  if (isfield (schema, "ResponseVars") && ! isempty (schema.ResponseVars))
    y_vars = schema.ResponseVars;
    y = zeros (n_rows, length (y_vars));

    for k = 1:length (y_vars)
      y_name = y_vars{k};
      raw_y = data.(y_name);

      if (iscell (raw_y))
          col_data = raw_y(valid_mask);
          try
            col_data = cell2mat (col_data);
          catch
            error (strcat ("parseWilkinsonFormula: Response", ...
                           " variable '%s' must be numeric."), y_name);
          end_try_catch
      else
          col_data = raw_y(valid_mask, :);
      endif

      if (! isnumeric (col_data))
        error (strcat ("parseWilkinsonFormula: Response", ...
                       " variable '%s' must be numeric"), y_name);
      endif

      if (size (col_data, 1) != n_rows)
        error (strcat ("parseWilkinsonFormula: Mismatch in number", ...
                       " of rows for response variable '%s'"), y_name);
      endif

      y(:, k) = col_data;
    endfor

  ## fallback to previous: single response located through ResponseIdx.
  elseif (! isempty (schema.ResponseIdx))
    y_name = req_vars{schema.ResponseIdx};
    raw_y = data.(y_name);
    if (iscell (raw_y)), y = raw_y(valid_mask);
    else, y = raw_y(valid_mask, :);
    endif
  endif

endfunction

function max_terms = get_maximal_terms (term_list)
  ## Return the terms of TERM_LIST that are not a subset (per is_subset) of
  ## any other term in the list.  An empty list yields {}.
  count = length (term_list);
  max_terms = {};
  if (count > 0)
    dominated = false (1, count);
    for a = 1:count
      ## Compare term A against every other position in the list.
      for b = [1:a-1, a+1:count]
        if (is_subset (term_list{a}, term_list{b}))
          dominated(a) = true;
          break;
        endif
      endfor
    endfor
    max_terms = term_list(! dominated);
  endif
endfunction

function vars = resolve_lhs_vars (lhs_str, data)
  ## Translate the response specification LHS_STR into a list of variable
  ## names taken from DATA.
  ##
  ## LHS_STR is split on commas; each non-empty piece is either a single
  ## variable name or a range "first - last".  A range selects every name
  ## lying between its two endpoints (inclusive) in the order the names
  ## appear in DATA, regardless of which endpoint is written first.
  ## DATA must be a table or a struct, otherwise an error is raised.
  ## Repeated names are returned once, at their first position.

  ## Collect the names that may be referenced, as a row.
  if (isa (data, "table"))
    all_names = data.Properties.VariableNames;
  elseif (isstruct (data))
    all_names = fieldnames (data);
    all_names = all_names(:)';
  else
    error ("parseWilkinsonFormula: Data must be a table or struct.");
  endif

  vars = {};
  if (isempty (lhs_str))
    return;
  endif

  ## Trim every comma-separated piece and discard the blank ones.
  pieces = strtrim (strsplit (lhs_str, ","));
  pieces = pieces(! cellfun (@isempty, pieces));

  for k = 1:numel (pieces)
    piece = pieces{k};
    bounds = strsplit (piece, "-");

    switch (numel (bounds))
      case 1
        ## Plain variable name.
        if (! any (strcmp (all_names, piece)))
          error (strcat ("parseWilkinsonFormula: Response", ...
                         " variable '%s' not found in Data."), piece);
        endif
        vars = [vars, {piece}];

      case 2
        ## Range: locate both endpoints among the available names.
        start_var = strtrim (bounds{1});
        end_var   = strtrim (bounds{2});
        idx_s = find (strcmp (all_names, start_var), 1);
        idx_e = find (strcmp (all_names, end_var), 1);

        if (isempty (idx_s))
          error ("parseWilkinsonFormula: Unknown variable '%s' in range", ...
                 start_var);
        endif
        if (isempty (idx_e))
          error ("parseWilkinsonFormula: Unknown variable '%s' in range", ...
                 end_var);
        endif

        ## Always slice in ascending position, so "C - A" equals "A - C".
        lo = min (idx_s, idx_e);
        hi = max (idx_s, idx_e);
        vars = [vars, all_names(lo:hi)];

      otherwise
        error ("parseWilkinsonFormula: Invalid syntax in response term '%s'", piece);
    endswitch
  endfor

  ## Drop repeats while keeping first-seen order.
  vars = unique (vars, "stable");
endfunction

%!demo
%! ## Demo : Tokenizer Mode
%! ## Inspects the raw tokens generated from a formula string.
%! formula = "y ~ A * (B + c)";
%! tokens = parseWilkinsonFormula (formula, "tokenize");
%! display (tokens);

%!demo
%! ## Demo : Parser Mode (AST generation)
%! ## Returns the Abstract Syntax Tree (AST) structure.
%! formula = "A / B";
%! tree = parseWilkinsonFormula (formula, "parse");
%! display (tree);

%!demo
%! ## Demo : Expansion Mode (Crossings)
%! ## Demonstrates standard Wilkinson expansion for interactions.
%! formula = "A * B * C";
%! terms = parseWilkinsonFormula (formula, "expand");
%! disp (terms);

%!demo
%! ## Demo : Expansion Mode (Nesting)
%! ## Demonstrates hierarchical nesting logic.
%! formula = "Block / Plot / Subplot";
%! terms = parseWilkinsonFormula (formula, "expand");
%! disp (terms);

%!demo
%! ## Demo : Matrix Schema Mode
%! ## Generates the binary terms matrix (Row = Term, Col = Variable).
%! formula = "y ~ Age + Height + Age:Height";
%! schema = parseWilkinsonFormula (formula, "matrix");
%! disp (schema.VariableNames);
%! disp (schema.Terms);

%!demo
%! ## Demo : Model Matrix (Regression / Continuous)
%! ## Builds the Design Matrix (X) and Response (y) for numeric data.
%! d_reg.BP = [120; 122; 128; 130; 125];
%! d_reg.Age = [25; 30; 35; 40; 32];
%! d_reg.Weight = [70; 75; 80; 85; 78];
%! [X, y, names] = parseWilkinsonFormula ("BP ~ Age * Weight", "model_matrix", d_reg);
%! disp (names);
%! disp (X);

%!demo
%! ## Demo : Model Matrix (ANOVA / Categorical)
%! ## Automatically handles categorical variables (dummy coding).
%! d_cat.Yield = [10; 12; 15; 14; 11; 13];
%! d_cat.Variety = {"A"; "A"; "B"; "B"; "C"; "C"};
%! [X, y, names] = parseWilkinsonFormula ("Yield ~ Variety", "model_matrix", d_cat);
%! disp (names);
%! disp (X);

%!demo
%! ## Demo : Model Matrix (Mixed Numeric & Categorical)
%! ## Demonstrates Analysis of Covariance (ANCOVA) structures.
%! d_mix.Growth = [1.2; 1.4; 1.1; 1.8];
%! d_mix.Fertilizer = {"Old"; "Old"; "New"; "New"};
%! d_mix.Dose = [10; 20; 10; 20];
%! [X, ~, names] = parseWilkinsonFormula ("Growth ~ Fertilizer * Dose", "model_matrix", d_mix);
%! disp (names);
%! disp (X);

%!demo
%! ## Demo : Multi-Response (List)
%! ## Selects specific response variables using a comma-separated list.
%! d_list = struct ();
%! d_list.Yield_A = [10; 12; 11; 14];
%! d_list.Yield_B = [20; 22; 21; 24];
%! d_list.Rain    = [100; 110; 105; 120];
%! formula = "Yield_A, Yield_B ~ Rain";
%! [X, y, names] = parseWilkinsonFormula (formula, "model_matrix", d_list);
%! disp (names);
%! disp (y);
%! disp (X);

%!demo
%! ## Demo : Multi-Response (Range)
%! ## Selects a contiguous range of variables using the hyphen.
%! d_rng.Y_Jan = rand (4, 1);
%! d_rng.Y_Feb = rand (4, 1);
%! d_rng.Y_Mar = rand (4, 1);
%! d_rng.Trt   = {"A"; "B"; "A"; "B"};
%! formula = "Y_Jan - Y_Mar ~ Trt";
%! [X, y, names] = parseWilkinsonFormula (formula, "model_matrix", d_rng);
%! disp (names);
%! disp (y);
%! disp (X);

%!test
%! ## Test : Identifiers with numbers and underscores
%! tokens = parseWilkinsonFormula ("Yield ~ Var_1 + A2_B", "tokenize");
%! vals = {tokens.value};
%! assert (vals, {"Yield", "~", "Var_1", "+", "A2_B", "EOF"});
%!test
%! ## Test : Floating point numbers
%! tokens = parseWilkinsonFormula ("y ~ 0.5 * A", "tokenize");
%! vals = {tokens.value};
%! assert (vals, {"y", "~", "0.5", "*", "A", "EOF"});
%!test
%! ## Test : Whitespace insensitivity
%! t1 = parseWilkinsonFormula ("A*B", "tokenize");
%! t2 = parseWilkinsonFormula ("A   * B", "tokenize");
%! assert ({t1.value}, {t2.value});
%!test
%! ## Test : Precedence
%! t = parseWilkinsonFormula ("A + B * C . D", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! assert (sort (terms), sort ({"A", "B", "C:D", "B:C:D"}));
%!test
%! ## Test : Parentheses Override
%! t = parseWilkinsonFormula ("(A + B) . C", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! assert (sort (terms), sort ({"A:C", "B:C"}));
%!test
%! ## Test : Crossing Operator (*)
%! t = parseWilkinsonFormula ("A * B", "expand");
%! assert (length (t), 3);
%! t3 = parseWilkinsonFormula ("A * B * C", "expand");
%! assert (length (t3), 7);
%!test
%! ## Test : Nesting Operator (/)
%! t = parseWilkinsonFormula ("Field / Plot", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! assert (sort (terms), sort ({"Field", "Field:Plot"}));
%!test
%! ## Test : Multi-level Nesting
%! t = parseWilkinsonFormula ("Block / Plot / Subplot", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! assert (sort (terms), sort ({"Block", "Block:Plot", "Block:Plot:Subplot"}));
%!test
%! ## Test : Interaction Operator (.)
%! t = parseWilkinsonFormula ("A . B", "expand");
%! assert (length (t), 1);
%! assert (t{1}, {"A", "B"});
%!test
%! ## Test : Power operator on cube.
%! t = parseWilkinsonFormula ("(A + B + C)^3", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! expected = sort ({"A", "B", "C", "A:B", "A:C", "B:C", "A:B:C"});
%! assert (sort (terms), expected);
%!test
%! ## Test : Power Operator.
%! t = parseWilkinsonFormula ("(A + B + C)^2", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! assert (! ismember ("A:B:C", terms));
%! assert (ismember ("A:B", terms));
%!test
%! ## Test : Redundancy Check
%! t1 = parseWilkinsonFormula ("A + A", "expand");
%! assert (length (t1), 1);
%! t2 = parseWilkinsonFormula ("A * A", "expand");
%! assert (length (t2), 1);
%!test
%! ## Test : Deletion - Exact (-)
%! t = parseWilkinsonFormula ("A * B - A", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! assert (sort (terms), sort ({"B", "A:B"}));
%!test
%! ## Test : Deletion - Clean (-*)
%! t = parseWilkinsonFormula ("A * B -* A", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! assert (sort (terms), {"B"});
%!test
%! ## Test : Deletion - Marginal (-/)
%! t = parseWilkinsonFormula ("A * B -/ A", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! assert (sort (terms), sort ({"A", "B"}));
%!test
%! ## Test : Deletion - Complex Sequence
%! t = parseWilkinsonFormula ("A*B*C - A:B:C", "expand");
%! assert (length (t), 6);
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! assert (! ismember ("A:B:C", terms));
%! assert (ismember ("A:B", terms));
%!test
%! ## Test : LHS and RHS Identification
%! s = parseWilkinsonFormula ("logY ~ A + B", "matrix");
%! assert (s.VariableNames{s.ResponseIdx}, "logY");
%! assert (any (strcmp ("A", s.VariableNames)));
%!test
%! ## Test : No Response Variable
%! s = parseWilkinsonFormula ("~ A + B", "matrix");
%! assert (isempty (s.ResponseIdx));
%!test
%! ## Test : Intercept Handling
%! s1 = parseWilkinsonFormula ("~ A", "matrix");
%! assert (any (all (s1.Terms == 0, 2)));
%! s2 = parseWilkinsonFormula ("~ A - 1", "matrix");
%! assert (! any (all (s2.Terms == 0, 2)));
%!test
%! ## Test : Numeric Interaction
%! d.y = [1;2;3;4;5];
%! d.X1 = [1;2;1;2;1];
%! d.X2 = [10;10;20;20;10];
%! [M, ~, ~] = parseWilkinsonFormula ("y ~ X1:X2", "model_matrix", d);
%! assert (size (M), [5, 2]);
%! assert (M(:, 2), d.X1 .* d.X2);
%!test
%! ## Test : Categorical Expansion
%! d.y = [1;1;1];
%! d.G = {"A"; "B"; "C"};
%! [M, ~, names] = parseWilkinsonFormula ("~ G", "model_matrix", d);
%! assert (size (M, 2), 3);
%! assert (names, {"(Intercept)"; "G_B"; "G_C"});
%!test
%! ## Test : Categorical * Categorical Rank
%! d.y = [1;2;3;4];
%! d.F1 = {"a";"b";"a";"b"};
%! d.F2 = {"x";"x";"y";"y"};
%! [M, ~, ~] = parseWilkinsonFormula ("~ F1 * F2", "model_matrix", d);
%! assert (size (M, 2), 4);
%! assert (rank (M), 4);
%!test
%! ## Test : Numeric * Categorical Naming
%! d.y = [1;2];
%! d.N = [10; 20];
%! d.C = {"lo"; "hi"};
%! [M, ~, names] = parseWilkinsonFormula ("~ N * C", "model_matrix", d);
%! assert (any (strcmp (names, "C_lo:N")));
%!test
%! ## Test : Intercept Only Model
%! d.y = [1; 2; 3];
%! [X, ~, names] = parseWilkinsonFormula ("y ~ 1", "model_matrix", d);
%! assert (size (X, 2), 1);
%! assert (names, {"(Intercept)"});
%! assert (all (X == 1));
%!test
%! ## Test : NaNs and Missing Data
%! d.y = [1; 2; 3; 4];
%! d.A = [1; 1; NaN; 1];
%! d.B = [10; 20; 30; NaN];
%! [X, y_out, ~] = parseWilkinsonFormula ("y ~ A", "model_matrix", d);
%! assert (length (y_out), 3);
%! assert (y_out(3), 4);
%! assert (size (X, 1), 3);
%!test
%! ## Test : Nesting with Groups
%! t = parseWilkinsonFormula ("A / (B + C)", "expand");
%! terms = cellfun (@(x) strjoin(sort(x), ":"), t, "UniformOutput", false);
%! expected = sort ({"A", "A:B", "A:C"});
%! assert (sort (terms), expected);
%!test
%! ## Test : Variable Name Collision
%! d.Var = [1; 1];
%! d.Var_1 = [2; 2];
%! [~, ~, names] = parseWilkinsonFormula ("~ Var + Var_1", "model_matrix", d);
%! assert (any (strcmp (names, "Var")));
%! assert (any (strcmp (names, "Var_1")));
%!test
%! ## Test : One-argument call
%! result = parseWilkinsonFormula ("A * B");
%! expected = sort ({"A", "B", "A:B"});
%! actual = cellfun (@(x) strjoin(sort(x), ":"), result, "UniformOutput", false);
%! assert (sort (actual), expected);
%!test
%! ## Test : Compatibility with Table Data
%! Age = [25; 30; 35; 40; 45];
%! Weight = [70; 75; 80; 85; 90];
%! BP = [120; 122; 128; 130; 135];
%! T = table (Age, Weight, BP);
%! formula = "BP ~ Age * Weight";
%! [X, y, names] = parseWilkinsonFormula (formula, "model_matrix", T);
%! assert (size (X), [5, 4]);
%! assert (y, BP);
%! assert (any (strcmp ("Age", names)));
%! assert (any (strcmp ("Weight", names)));
%! assert (names{1}, "(Intercept)");
%!test
%! ## Test : Multi-variable List
%! d.y1 = [1; 2; 3]; d.y2 = [4; 5; 6]; d.x = [1; 0; 1];
%! [X, y, ~] = parseWilkinsonFormula ("y1, y2 ~ x", "model_matrix", d);
%! assert (size (y), [3, 2]);
%! assert (y(:,1), d.y1);
%! assert (y(:,2), d.y2);
%!test
%! ## Test : multivariable range.
%! d.A = [10;20]; d.B = [30;40]; d.C = [50;60]; d.x = [1;2];
%! [X, y, ~] = parseWilkinsonFormula ("A - C ~ x", "model_matrix", d);
%! assert (size (y), [2, 3]);
%! assert (y(:,1), d.A);
%! assert (y(:,2), d.B);
%! assert (y(:,3), d.C);
%!test
%! ## Test : multivariable list + range.
%! d.y1 = [1]; d.y2 = [2]; d.y3 = [3]; d.y4 = [4]; d.y5 = [5];
%! d.x1 = [10]; d.x2 = [2];
%! [X, y, names] = parseWilkinsonFormula ("y1, y3 - y5 ~ x1:x2", "model_matrix", d);
%! expected_y = [d.y1, d.y3, d.y4, d.y5];
%! assert (isequal (y, expected_y));
%! assert (size (X, 2), 2);
%! assert (any (strcmp (names, "x1:x2")));
%!test
%! ## Test : reverse range.
%! d.A = [1]; d.B = [2]; d.C = [3]; d.x = [10];
%! [X, y, names] = parseWilkinsonFormula ("C - A ~ x - 1", "model_matrix", d);
%! assert (size (y), [1, 3]);
%! assert (y(:,1), d.A);
%! assert (y(:,3), d.C);
%! assert (size (X, 2), 1);
%! assert (! any (strcmp (names, "(Intercept)")));
%!test
%! ## Test : nans in multi-y.
%! d.yA = {1; 2; 3; 4};
%! d.yB = [10; 20; NaN; 40];
%! d.x  = [1; 1; 1; 1];
%! [X, y, ~] = parseWilkinsonFormula ("yA, yB ~ x", "model_matrix", d);
%! assert (size (y), [3, 2]);
%! assert (y(3, 1), 4);
%! assert (y(3, 2), 40);
%! assert (size (X, 1), 3);
%!error <Input formula string is required> parseWilkinsonFormula ()
%!error <Unknown mode> parseWilkinsonFormula ("y ~ x", "invalid_mode")
%!error <Unexpected End Of Formula> parseWilkinsonFormula ("", "parse")
%!error <Unexpected End Of Formula> parseWilkinsonFormula ("A +", "parse")
%!error <Unexpected End Of Formula> parseWilkinsonFormula ("A *", "parse")
%!error <Unexpected End Of Formula> parseWilkinsonFormula ("A .", "parse")
%!error <Unexpected End Of Formula> parseWilkinsonFormula ("A /", "parse")
%!error <Exponent must be a number> parseWilkinsonFormula ("(A+B)^C", "expand")
%!error <Mismatched Parentheses> parseWilkinsonFormula ("(A + B", "parse")
%!error <parseWilkinsonFormula: Unexpected token> parseWilkinsonFormula ("A + B)", "parse")
%!error <Unexpected token> parseWilkinsonFormula ("( )", "parse")
%!error <Unexpected token> parseWilkinsonFormula ("A + * B", "parse")
%!error <Unexpected token> parseWilkinsonFormula ("y ~ x ~ z", "parse")
%!error <'model_matrix' mode requires a Data Table> parseWilkinsonFormula ("~ A", "model_matrix")
%!error <Unknown variable> d.x=1; parseWilkinsonFormula ("~ Z", "model_matrix", d)
%!error <Response variable 'Z' not found in Data> d.x=1; d.y=1; parseWilkinsonFormula ("Z ~ x", "model_matrix", d)
%!error <Unknown variable 'A' in range> d.x=1; d.y=1; parseWilkinsonFormula ("A - y ~ x", "model_matrix", d)
%!error <Unknown variable 'B' in range> d.x=1; d.y=1; parseWilkinsonFormula ("y - B ~ x", "model_matrix", d)
%!error <Invalid syntax in response term> d.y=1; parseWilkinsonFormula ("y - y - y ~ x", "model_matrix", d)
%!error <Response variable 'S' must be numeric> d.S={"a";"b"}; d.x=[1;2]; parseWilkinsonFormula ("S ~ x", "model_matrix", d)
%!error <Data must be a table or struct> parseWilkinsonFormula ("y ~ x", "model_matrix", [1,2,3])
