{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prediction interval calculation by ``bayesml.metatree`` (accepted at AISTATS 2025)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Installation of BayesML\n",
    "\n",
    "Installation from PyPI will be available soon. Until then, please follow these steps:\n",
    "\n",
    "1. Clone BayesML from https://github.com/bayesml/BayesML.\n",
    "2. Change the directory and move to the root of the cloned BayesML folder where setup.py exists. (e.g., /Users/UserName/Documents/GitHub/BayesML)\n",
    "3. Run `pip install .`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>bp</th>\n",
       "      <th>s1</th>\n",
       "      <th>s2</th>\n",
       "      <th>s3</th>\n",
       "      <th>s4</th>\n",
       "      <th>s5</th>\n",
       "      <th>s6</th>\n",
       "      <th>target</th>\n",
       "      <th>intercept</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.038076</td>\n",
       "      <td>0.050680</td>\n",
       "      <td>0.061696</td>\n",
       "      <td>0.021872</td>\n",
       "      <td>-0.044223</td>\n",
       "      <td>-0.034821</td>\n",
       "      <td>-0.043401</td>\n",
       "      <td>-0.002592</td>\n",
       "      <td>0.019907</td>\n",
       "      <td>-0.017646</td>\n",
       "      <td>151.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.001882</td>\n",
       "      <td>-0.044642</td>\n",
       "      <td>-0.051474</td>\n",
       "      <td>-0.026328</td>\n",
       "      <td>-0.008449</td>\n",
       "      <td>-0.019163</td>\n",
       "      <td>0.074412</td>\n",
       "      <td>-0.039493</td>\n",
       "      <td>-0.068332</td>\n",
       "      <td>-0.092204</td>\n",
       "      <td>75.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.085299</td>\n",
       "      <td>0.050680</td>\n",
       "      <td>0.044451</td>\n",
       "      <td>-0.005670</td>\n",
       "      <td>-0.045599</td>\n",
       "      <td>-0.034194</td>\n",
       "      <td>-0.032356</td>\n",
       "      <td>-0.002592</td>\n",
       "      <td>0.002861</td>\n",
       "      <td>-0.025930</td>\n",
       "      <td>141.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.089063</td>\n",
       "      <td>-0.044642</td>\n",
       "      <td>-0.011595</td>\n",
       "      <td>-0.036656</td>\n",
       "      <td>0.012191</td>\n",
       "      <td>0.024991</td>\n",
       "      <td>-0.036038</td>\n",
       "      <td>0.034309</td>\n",
       "      <td>0.022688</td>\n",
       "      <td>-0.009362</td>\n",
       "      <td>206.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.005383</td>\n",
       "      <td>-0.044642</td>\n",
       "      <td>-0.036385</td>\n",
       "      <td>0.021872</td>\n",
       "      <td>0.003935</td>\n",
       "      <td>0.015596</td>\n",
       "      <td>0.008142</td>\n",
       "      <td>-0.002592</td>\n",
       "      <td>-0.031988</td>\n",
       "      <td>-0.046641</td>\n",
       "      <td>135.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        age       sex       bmi        bp        s1        s2        s3  \\\n",
       "0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   \n",
       "1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   \n",
       "2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   \n",
       "3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   \n",
       "4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   \n",
       "\n",
       "         s4        s5        s6  target  intercept  \n",
       "0 -0.002592  0.019907 -0.017646   151.0        1.0  \n",
       "1 -0.039493 -0.068332 -0.092204    75.0        1.0  \n",
       "2 -0.002592  0.002861 -0.025930   141.0        1.0  \n",
       "3  0.034309  0.022688 -0.009362   206.0        1.0  \n",
       "4 -0.002592 -0.031988 -0.046641   135.0        1.0  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.datasets import load_diabetes\n",
    "diabetes = load_diabetes(as_frame=True)\n",
    "df = diabetes.frame\n",
    "df['intercept'] = 1.0 # Add intercept column\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "x_continuous = df[[\"age\",\"bmi\",\"bp\",\"s1\",\"s2\",\"s3\",\"s4\",\"s5\",\"s6\",\"intercept\"]].to_numpy()\n",
    "x_categorical = np.where(df[['sex']]>0, 1, 0) # Convert to binary\n",
    "y = df['target'].to_numpy()\n",
    "\n",
    "(x_train_continuous,x_test_continuous,\n",
    " x_train_categorical,x_test_categorical,\n",
    " y_train,y_test) = train_test_split(x_continuous,\n",
    "                                    x_categorical,\n",
    "                                    y,\n",
    "                                    test_size=0.5,random_state=0)\n",
    "\n",
    "scaler = StandardScaler()\n",
    "x_train_continuous[:,:-1] = scaler.fit_transform(x_train_continuous[:,:-1])\n",
    "x_test_continuous[:,:-1] = scaler.transform(x_test_continuous[:,:-1])\n",
    "y_train = scaler.fit_transform(y_train[:,np.newaxis])[:,0]\n",
    "y_test = scaler.transform(y_test[:,np.newaxis])[:,0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create model instance\n",
    "\n",
    "First, calculate model constants."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# feature dimensions\n",
    "continuous_dim = x_train_continuous.shape[1]\n",
    "categorical_dim = x_train_categorical.shape[1]\n",
    "\n",
    "# ranges for continuous features\n",
    "# these ranges will be recursively bisected\n",
    "ranges = np.empty([continuous_dim,2])\n",
    "ranges[:,0] = x_train_continuous.min(axis=0)\n",
    "ranges[:,1] = x_train_continuous.max(axis=0)\n",
    "\n",
    "# number of assignments for each feature on a path\n",
    "# -1 means that the feature can be assigned any number of times\n",
    "num_assignment_vec = -np.ones(continuous_dim+categorical_dim,dtype=int)\n",
    "num_assignment_vec[continuous_dim-1] = 0 # intercept term is never assigned"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, create a model instance based on the model constants."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bayesml import metatree,linearregression\n",
    "\n",
    "# create model\n",
    "model = metatree.LearnModel(\n",
    "    c_dim_continuous=continuous_dim,\n",
    "    c_dim_categorical=categorical_dim,\n",
    "    c_max_depth=10,\n",
    "    c_num_assignment_vec=num_assignment_vec,\n",
    "    c_ranges=ranges,\n",
    "    h0_g=0.75,\n",
    "    SubModel=linearregression, # leaf model is linear regression\n",
    "    sub_constants={'c_degree':continuous_dim}, # linear regression degree\n",
    "    sub_h0_params={'h0_mu_vec':np.zeros(continuous_dim), # hyperparameters for normal-gamma prior\n",
    "                   'h0_lambda_mat':np.eye(continuous_dim),\n",
    "                   'h0_alpha':2.1,\n",
    "                   'h0_beta':1.0},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate posterior distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "brun_in: 100\n",
      "99\n",
      "num_metatrees: 500\n",
      "499\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<bayesml.metatree._metatree.LearnModel at 0x14f49d0d0>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.update_posterior(\n",
    "    x_train_continuous,\n",
    "    x_train_categorical,\n",
    "    y_train,\n",
    "    alg_type='REMTMCMC',\n",
    "    burn_in=100,\n",
    "    num_metatrees=500,\n",
    "    threshold_type='sample_midpoint',\n",
    "    seed=0,\n",
    "    g_max=0.9,\n",
    "    num_chains=8,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate prediction interval\n",
    "\n",
    "First, define subroutines to calculate ``100(1-alpha)``% prediction interval from the values of the probability density function of the predictive distribution. The initial range is determined by the mean and the variance of the predictive distribution."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.integrate import simpson\n",
    "from scipy.optimize import bisect\n",
    "\n",
    "def calc_prediction_interval_sub(k,x,y,alpha):\n",
    "    return simpson(y[y>=k],x=x[:np.count_nonzero(y>=k)])-(1-alpha)\n",
    "\n",
    "def calc_prediction_interval(model,alpha,num_steps):\n",
    "    mean = model.make_prediction(loss='squared')\n",
    "    sd = np.sqrt(model.calc_pred_var())\n",
    "    l = mean - sd/np.sqrt(alpha) # Lower bound by Chebyshev's inequality\n",
    "    r = mean + sd/np.sqrt(alpha) # Upper bound by Chebyshev's inequality\n",
    "    x = np.linspace(l,r,num_steps)\n",
    "    y = model.calc_pred_density(x)\n",
    "    k_max = y.max()\n",
    "    k = bisect(calc_prediction_interval_sub,0,k_max,(x,y,alpha))\n",
    "    l_list = []\n",
    "    r_list = []\n",
    "    for i in range(len(x)-1):\n",
    "        if y[i] < k and k <= y[i+1]:\n",
    "            l_list.append(x[i+1])\n",
    "        if y[i] >= k and k > y[i+1]:\n",
    "            r_list.append(x[i])\n",
    "    \n",
    "    return l_list,r_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate prediction interval by using the above subroutines."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "220/220\r"
     ]
    }
   ],
   "source": [
    "pred_interval = []\n",
    "for i in range(y_test.shape[0]):\n",
    "    # calculate predictive distribution for the i-th test data\n",
    "    model.calc_pred_dist(\n",
    "        x_test_continuous[i],\n",
    "        x_test_categorical[i],\n",
    "    )\n",
    "    # calculate pretiction interval\n",
    "    pred_interval.append(calc_prediction_interval(model,0.05,1000))\n",
    "    print(f'{i}/{y_test.shape[0]-1}',end='\\r')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluation\n",
    "\n",
    "First, define subroutines for evaluation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate frequency that the true value is in the prediction interval\n",
    "def calc_frequency(y,pred_intervals):\n",
    "    tmp_frequency = 0\n",
    "    for i,interval in enumerate(pred_intervals):\n",
    "        for j in range(len(interval[0])):\n",
    "            if interval[0][j] < y[i] and y[i] < interval[1][j]:\n",
    "                tmp_frequency += 1\n",
    "    \n",
    "    return tmp_frequency/len(pred_intervals)\n",
    "\n",
    "# Calculate length of the prediction interval\n",
    "def calc_length(pred_intervals):\n",
    "    tmp_length = 0\n",
    "    for interval in pred_intervals:\n",
    "        for i in range(len(interval[0])):\n",
    "            tmp_length += (interval[1][i]-interval[0][i])\n",
    "    \n",
    "    return tmp_length/len(pred_intervals)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Evaluate the results. Note that frequency close to ``100(1-alpha)``% (here, 0.95) and short length are desireble."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg. frequency: 0.9366515837104072\n",
      "Avg. length: 2.700600606247248\n"
     ]
    }
   ],
   "source": [
    "print(f'Avg. frequency: {calc_frequency(y_test,pred_interval)}')\n",
    "print(f'Avg. length: {calc_length(pred_interval)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate prediction interval by sampling\n",
    "\n",
    "Using metatree.GenModel, prediction intervals can be calculated from a sample from the posterior predictive distribution."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<bayesml.metatree._metatree.GenModel at 0x16b81c610>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gen_model = metatree.GenModel(\n",
    "    c_dim_continuous=x_train_continuous.shape[1],\n",
    "    c_dim_categorical=x_train_categorical.shape[1],\n",
    "    c_max_depth=10,\n",
    "    c_num_assignment_vec=num_assignment_vec,\n",
    "    c_ranges=ranges,\n",
    "    h_g=0.75,\n",
    "    SubModel=linearregression,\n",
    "    sub_constants={'c_degree':x_train_continuous.shape[1]},\n",
    "    sub_h_params={'h_mu_vec':np.zeros(continuous_dim),\n",
    "                   'h_lambda_mat':np.eye(continuous_dim),\n",
    "                   'h_alpha':2.1,\n",
    "                   'h_beta':1.0},\n",
    "    seed=0,\n",
    ")\n",
    "\n",
    "# set hyperparameters for the generative model as those of the learned model\n",
    "gen_model.set_h_params(\n",
    "    h_metatree_list=model.hn_metatree_list,\n",
    "    h_metatree_prob_vec=model.hn_metatree_prob_vec,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "220/220\r"
     ]
    }
   ],
   "source": [
    "from arviz import hdi\n",
    "\n",
    "sample_size = 500\n",
    "y_pred_sample = np.zeros(sample_size)\n",
    "pred_interval = []\n",
    "\n",
    "for i in range(y_test.shape[0]):\n",
    "    for t in range(sample_size):\n",
    "        # generate parameters from the posterior distribution\n",
    "        gen_model.gen_params()\n",
    "        # generate a sample according to the generated parameters\n",
    "        _,_,y_pred_sample[t] = gen_model.gen_sample(\n",
    "            x_continuous=x_test_continuous[i],\n",
    "            x_categorical=x_test_categorical[i],\n",
    "        )\n",
    "    pred_interval.append(hdi(y_pred_sample,0.95,multimodal=True).T)\n",
    "    print(f'{i}/{y_test.shape[0]-1}',end='\\r')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg. frequency: 0.9638009049773756\n",
      "Avg. length: 2.8378308334394333\n"
     ]
    }
   ],
   "source": [
    "print(f'Avg. frequency: {calc_frequency(y_test,pred_interval)}')\n",
    "print(f'Avg. length: {calc_length(pred_interval)}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "bayesml_dev",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}