{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Prediction interval calculation by ``bayesml.metatree`` (accepted at AISTATS 2025)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Installation of BayesML\n", "\n", "Installation from PyPI will be available soon. Until then, please follow these steps:\n", "\n", "1. Clone BayesML from https://github.com/bayesml/BayesML.\n", "2. Change the directory and move to the root of the cloned BayesML folder where setup.py exists. (e.g., /Users/UserName/Documents/GitHub/BayesML)\n", "3. Run `pip install .`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexbmibps1s2s3s4s5s6targetintercept
00.0380760.0506800.0616960.021872-0.044223-0.034821-0.043401-0.0025920.019907-0.017646151.01.0
1-0.001882-0.044642-0.051474-0.026328-0.008449-0.0191630.074412-0.039493-0.068332-0.09220475.01.0
20.0852990.0506800.044451-0.005670-0.045599-0.034194-0.032356-0.0025920.002861-0.025930141.01.0
3-0.089063-0.044642-0.011595-0.0366560.0121910.024991-0.0360380.0343090.022688-0.009362206.01.0
40.005383-0.044642-0.0363850.0218720.0039350.0155960.008142-0.002592-0.031988-0.046641135.01.0
\n", "
" ], "text/plain": [ " age sex bmi bp s1 s2 s3 \\\n", "0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 \n", "1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 \n", "2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 \n", "3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 \n", "4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 \n", "\n", " s4 s5 s6 target intercept \n", "0 -0.002592 0.019907 -0.017646 151.0 1.0 \n", "1 -0.039493 -0.068332 -0.092204 75.0 1.0 \n", "2 -0.002592 0.002861 -0.025930 141.0 1.0 \n", "3 0.034309 0.022688 -0.009362 206.0 1.0 \n", "4 -0.002592 -0.031988 -0.046641 135.0 1.0 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "from sklearn.datasets import load_diabetes\n", "diabetes = load_diabetes(as_frame=True)\n", "df = diabetes.frame\n", "df['intercept'] = 1.0 # Add intercept column\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocessing" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "x_continuous = df[[\"age\",\"bmi\",\"bp\",\"s1\",\"s2\",\"s3\",\"s4\",\"s5\",\"s6\",\"intercept\"]].to_numpy()\n", "x_categorical = np.where(df[['sex']]>0, 1, 0) # Convert to binary\n", "y = df['target'].to_numpy()\n", "\n", "(x_train_continuous,x_test_continuous,\n", " x_train_categorical,x_test_categorical,\n", " y_train,y_test) = train_test_split(x_continuous,\n", " x_categorical,\n", " y,\n", " test_size=0.5,random_state=0)\n", "\n", "scaler = StandardScaler()\n", "x_train_continuous[:,:-1] = scaler.fit_transform(x_train_continuous[:,:-1])\n", "x_test_continuous[:,:-1] = scaler.transform(x_test_continuous[:,:-1])\n", "y_train = scaler.fit_transform(y_train[:,np.newaxis])[:,0]\n", "y_test = scaler.transform(y_test[:,np.newaxis])[:,0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create model instance\n", "\n", "First, calculate model constants." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# feature dimensions\n", "continuous_dim = x_train_continuous.shape[1]\n", "categorical_dim = x_train_categorical.shape[1]\n", "\n", "# ranges for continuous features\n", "# these ranges will be recursively bisected\n", "ranges = np.empty([continuous_dim,2])\n", "ranges[:,0] = x_train_continuous.min(axis=0)\n", "ranges[:,1] = x_train_continuous.max(axis=0)\n", "\n", "# number of assignments for each feature on a path\n", "# -1 means that the feature can be assigned any number of times\n", "num_assignment_vec = -np.ones(continuous_dim+categorical_dim,dtype=int)\n", "num_assignment_vec[continuous_dim-1] = 0 # intercept term is never assigned" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, create a model instance based on the model constants." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from bayesml import metatree,linearregression\n", "\n", "# create model\n", "model = metatree.LearnModel(\n", " c_dim_continuous=continuous_dim,\n", " c_dim_categorical=categorical_dim,\n", " c_max_depth=10,\n", " c_num_assignment_vec=num_assignment_vec,\n", " c_ranges=ranges,\n", " h0_g=0.75,\n", " SubModel=linearregression, # leaf model is linear regression\n", " sub_constants={'c_degree':continuous_dim}, # linear regression degree\n", " sub_h0_params={'h0_mu_vec':np.zeros(continuous_dim), # hyperparameters for normal-gamma prior\n", " 'h0_lambda_mat':np.eye(continuous_dim),\n", " 'h0_alpha':2.1,\n", " 'h0_beta':1.0},\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculate posterior distribution" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "brun_in: 100\n", "99\n", "num_metatrees: 500\n", "499\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.update_posterior(\n", " x_train_continuous,\n", " x_train_categorical,\n", " y_train,\n", " alg_type='REMTMCMC',\n", " burn_in=100,\n", " num_metatrees=500,\n", " threshold_type='sample_midpoint',\n", " seed=0,\n", " g_max=0.9,\n", " num_chains=8,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculate prediction interval\n", "\n", "First, define subroutines to calculate ``100(1-alpha)``% prediction interval from the values of the probability density function of the predictive distribution. The initial range is determined by the mean and the variance of the predictive distribution." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from scipy.integrate import simpson\n", "from scipy.optimize import bisect\n", "\n", "def calc_prediction_interval_sub(k,x,y,alpha):\n", " return simpson(y[y>=k],x=x[:np.count_nonzero(y>=k)])-(1-alpha)\n", "\n", "def calc_prediction_interval(model,alpha,num_steps):\n", " mean = model.make_prediction(loss='squared')\n", " sd = np.sqrt(model.calc_pred_var())\n", " l = mean - sd/np.sqrt(alpha) # Lower bound by Chebyshev's inequality\n", " r = mean + sd/np.sqrt(alpha) # Upper bound by Chebyshev's inequality\n", " x = np.linspace(l,r,num_steps)\n", " y = model.calc_pred_density(x)\n", " k_max = y.max()\n", " k = bisect(calc_prediction_interval_sub,0,k_max,(x,y,alpha))\n", " l_list = []\n", " r_list = []\n", " for i in range(len(x)-1):\n", " if y[i] < k and k <= y[i+1]:\n", " l_list.append(x[i+1])\n", " if y[i] >= k and k > y[i+1]:\n", " r_list.append(x[i])\n", " \n", " return l_list,r_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Calculate prediction interval by using the above subroutines." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "220/220\r" ] } ], "source": [ "pred_interval = []\n", "for i in range(y_test.shape[0]):\n", " # calculate predictive distribution for the i-th test data\n", " model.calc_pred_dist(\n", " x_test_continuous[i],\n", " x_test_categorical[i],\n", " )\n", " # calculate pretiction interval\n", " pred_interval.append(calc_prediction_interval(model,0.05,1000))\n", " print(f'{i}/{y_test.shape[0]-1}',end='\\r')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Evaluation\n", "\n", "First, define subroutines for evaluation." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Calculate frequency that the true value is in the prediction interval\n", "def calc_frequency(y,pred_intervals):\n", " tmp_frequency = 0\n", " for i,interval in enumerate(pred_intervals):\n", " for j in range(len(interval[0])):\n", " if interval[0][j] < y[i] and y[i] < interval[1][j]:\n", " tmp_frequency += 1\n", " \n", " return tmp_frequency/len(pred_intervals)\n", "\n", "# Calculate length of the prediction interval\n", "def calc_length(pred_intervals):\n", " tmp_length = 0\n", " for interval in pred_intervals:\n", " for i in range(len(interval[0])):\n", " tmp_length += (interval[1][i]-interval[0][i])\n", " \n", " return tmp_length/len(pred_intervals)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Evaluate the results. Note that frequency close to ``100(1-alpha)``% (here, 0.95) and short length are desireble." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Avg. frequency: 0.9366515837104072\n", "Avg. length: 2.700600606247248\n" ] } ], "source": [ "print(f'Avg. frequency: {calc_frequency(y_test,pred_interval)}')\n", "print(f'Avg. length: {calc_length(pred_interval)}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculate prediction interval by sampling\n", "\n", "Using metatree.GenModel, prediction intervals can be calculated from a sample from the posterior predictive distribution." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gen_model = metatree.GenModel(\n", " c_dim_continuous=x_train_continuous.shape[1],\n", " c_dim_categorical=x_train_categorical.shape[1],\n", " c_max_depth=10,\n", " c_num_assignment_vec=num_assignment_vec,\n", " c_ranges=ranges,\n", " h_g=0.75,\n", " SubModel=linearregression,\n", " sub_constants={'c_degree':x_train_continuous.shape[1]},\n", " sub_h_params={'h_mu_vec':np.zeros(continuous_dim),\n", " 'h_lambda_mat':np.eye(continuous_dim),\n", " 'h_alpha':2.1,\n", " 'h_beta':1.0},\n", " seed=0,\n", ")\n", "\n", "# set hyperparameters for the generative model as those of the learned model\n", "gen_model.set_h_params(\n", " h_metatree_list=model.hn_metatree_list,\n", " h_metatree_prob_vec=model.hn_metatree_prob_vec,\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "220/220\r" ] } ], "source": [ "from arviz import hdi\n", "\n", "sample_size = 500\n", "y_pred_sample = np.zeros(sample_size)\n", "pred_interval = []\n", "\n", "for i in range(y_test.shape[0]):\n", " for t in range(sample_size):\n", " # generate parameters from the posterior distribution\n", " gen_model.gen_params()\n", " # generate a sample according to the generated parameters\n", " _,_,y_pred_sample[t] = gen_model.gen_sample(\n", " x_continuous=x_test_continuous[i],\n", " x_categorical=x_test_categorical[i],\n", " )\n", " pred_interval.append(hdi(y_pred_sample,0.95,multimodal=True).T)\n", " print(f'{i}/{y_test.shape[0]-1}',end='\\r')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Avg. frequency: 0.9638009049773756\n", "Avg. length: 2.8378308334394333\n" ] } ], "source": [ "print(f'Avg. frequency: {calc_frequency(y_test,pred_interval)}')\n", "print(f'Avg. length: {calc_length(pred_interval)}')" ] } ], "metadata": { "kernelspec": { "display_name": "bayesml_dev", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 2 }