Source code for trancit.models.bic_selection

"""
BIC (Bayesian Information Criterion) model selection.

This module provides BIC-based model order selection for VAR models.
"""

import logging
from typing import Dict, Tuple

import numpy as np

from trancit import PipelineConfig
from trancit.utils import compute_event_statistics

from ..core.exceptions import ValidationError

logger = logging.getLogger(__name__)



[docs]
class BICSelector:
    """
    Bayesian Information Criterion (BIC) selector for VAR model order selection.

    This class provides methods for computing BIC values and selecting optimal
    model orders for Vector Autoregression (VAR) models.
    """


[docs]
    def __init__(self, config: PipelineConfig):
        """
        Initialize BIC selector.

        Parameters
        ----------
        config : PipelineConfig
            Pipeline configuration
        """
        self.config = config
        self._validate_parameters()


    def _validate_parameters(self) -> None:
        """Validate initialization parameters."""
        if self.config.bic.momax <= 0:
            raise ValidationError(
                "max_order must be positive", "max_order", self.config.bic.momax
            )
        if self.config.bic.mode not in ["biased", "debiased"]:
            raise ValidationError(
                "mode must be 'biased' or 'debiased'", "mode", self.config.bic.mode
            )

    def _compute_multi_trial_bic(self, event_data_max_order: np.ndarray) -> Dict:
        """
        Calculate Bayesian Information Criterion (BIC) for multiple model orders
        across trial data.

        This function computes BIC values for Vector Autoregression (VAR) models
        of orders 1 to momax,
        supporting model selection with multiple BIC variants.

        Parameters
        ----------
        event_data_max_order : np.ndarray
            Time series data for the maximum model order,
            shape (n_vars * (max_order + 1), n_observations, n_trials).
        bic_params : dict
            Parameters for BIC calculation, containing:
                - "Params": Sub-dict with "BIC": {"momax": int}, the maximum
                  model order.

        Returns
        -------
        dict
            Dictionary containing:
                - 'bic': BIC values for each model order and variant, shape
                  (max_order, 4).
                - 'penalty_terms': Penalty terms for BIC, shape (max_order, 4).
                - 'log_likelihood': Log-likelihood for each model order, shape
                  (max_order,).
                - 'sum_log_det_hessian': Sum of log determinant of Hessian,
                  shape (max_order,).
                - 'optimal_orders': Optimal model orders for each BIC variant,
                  shape (4,).

        Raises
        ------
        ValueError
            If input data shape is inconsistent with expected dimensions.
        """
        max_order = self.config.bic.momax
        print(f"max_order: {max_order}")
        total_var_lag, n_observations, n_trials = event_data_max_order.shape
        n_vars = total_var_lag // (max_order + 1)

        if total_var_lag != n_vars * (max_order + 1):
            raise ValueError(
                "Data shape does not match expected dimensions based on " "max_order."
            )

        bic_outputs = {
            "bic": np.full((max_order, 4), np.nan),
            "pt_bic": np.full((max_order, 4), np.nan),
            "log_likelihood": np.full(max_order, np.nan),
            "sum_log_det_hessian": np.full(max_order, np.nan),
            "optimal_orders": None,
        }

        for model_order in range(1, max_order + 1):
            logger.info(f"Calculating BIC for model order: {model_order}")
            data_subset = event_data_max_order[: n_vars * (model_order + 1), :, :]

            log_likelihood, sum_log_det_hessian = self._compute_bic_for_model(
                data_subset, model_order, self.config
            )
            bic_outputs["log_likelihood"][model_order - 1] = log_likelihood
            bic_outputs["sum_log_det_hessian"][model_order - 1] = sum_log_det_hessian

            bic_outputs["pt_bic"][model_order - 1, 0] = (
                0.5 * n_observations * model_order * n_vars * n_vars * np.log(n_trials)
            )
            bic_outputs["pt_bic"][model_order - 1, 1] = 0.5 * sum_log_det_hessian
            bic_outputs["pt_bic"][model_order - 1, 2] = (
                0.5
                * n_observations
                * model_order
                * n_vars
                * n_vars
                * np.log(n_trials * n_observations)
            )
            bic_outputs["pt_bic"][model_order - 1, 3] = (
                0.5 * model_order * n_vars * n_vars * np.log(n_trials * n_observations)
            )

            for variant in range(4):
                bic_outputs["bic"][model_order - 1, variant] = (
                    -bic_outputs["log_likelihood"][model_order - 1] * n_trials
                    + bic_outputs["pt_bic"][model_order - 1, variant]
                )

        optimal_orders = np.nanargmin(bic_outputs["bic"], axis=0) + 1
        bic_outputs["mobic"] = optimal_orders
        logger.info(f"Optimal model orders for BIC variants: {optimal_orders}")

        return bic_outputs

    def _compute_bic_for_model(
        self, event_data: np.ndarray, model_order: int, config: PipelineConfig
    ) -> Tuple[float, float]:
        """
        Compute log-likelihood and sum of log determinant of Hessian for a
        specific model order.

        Supports 'biased' mode currently; 'debiased' mode is planned for
        future implementation.

        Parameters
        ----------
        event_data : np.ndarray
            Event data for the model order,
            shape (n_vars * (model_order + 1), n_observations, n_trials).
        model_order : int
            The VAR model order to evaluate.
        bic_params : dict
            BIC parameters including:
                - 'Params': Sub-dict with 'BIC': {'mode': str}, e.g., 'biased'.
                - 'EstimMode': Estimation mode, either 'OLS' or 'RLS'.

        Returns
        -------
        Tuple[float, float]
            - log_likelihood: The log-likelihood of the model.
            - sum_log_det_hessian: Sum of the log determinant of the Hessian.

        Raises
        ------
        ValueError
            If 'mode' or 'EstimMode' is invalid.
        """
        total_var_lag, n_observations, n_trials = event_data.shape
        n_vars = total_var_lag // (model_order + 1)

        mode = config.bic.mode
        if mode not in ["biased"]:
            raise ValueError(
                f"Unsupported mode '{mode}'; only 'biased' is implemented."
            )

        if mode == "biased":
            stats = compute_event_statistics(event_data, model_order)
        else:
            raise NotImplementedError("'debiased' mode is not yet supported.")

        log_det_hessian = np.zeros(n_observations)
        residual_determinants = np.zeros(n_observations)

        estim_mode = config.bic.estim_mode
        if estim_mode not in ["OLS", "RLS"]:
            raise ValueError(
                f"Invalid EstimMode '{estim_mode}'; must be 'OLS' or 'RLS'."
            )

        for t in range(n_observations):
            covariance_current = stats["Sigma"][t, n_vars:, n_vars:]
            residual_cov = stats[estim_mode]["Sigma_Et"][t]
            residual_determinants[t] = np.prod(np.diag(residual_cov))
            log_det_hessian[t] = (
                model_order * n_vars**2 * np.log(n_trials)
                + n_vars * np.log(np.linalg.det(covariance_current))
                - n_vars * model_order * np.log(residual_determinants[t] or 1e-8)
            )

        log_likelihood = (
            -0.5 * n_observations * n_vars * np.log(2 * np.pi)
            - 0.5 * np.sum(np.log(residual_determinants + 1e-8))
            - 0.5 * n_observations * n_vars
        )
        sum_log_det_hessian = np.sum(log_det_hessian)

        logger.debug(
            f"Model order {model_order}: log_likelihood={log_likelihood:.4f}, "
            f"sum_log_det_hessian={sum_log_det_hessian:.4f}"
        )

        return log_likelihood, sum_log_det_hessian