Prediction interval calculation by bayesml.metatree (accepted at AISTATS 2025)#
Installation of BayesML#
Installation from PyPI will be available soon. Until then, please follow these steps:
1. Clone BayesML from bayesml/BayesML.
2. Move to the root directory of the cloned BayesML folder, where setup.py exists (e.g., /Users/UserName/Documents/GitHub/BayesML).
3. Run the following command:
pip install .
Load dataset#
[1]:
import pandas as pd
from sklearn.datasets import load_diabetes
diabetes = load_diabetes(as_frame=True)
df = diabetes.frame
df['intercept'] = 1.0 # Add intercept column
df.head()
[1]:
| | age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target | intercept |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 | 151.0 | 1.0 |
| 1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 | 75.0 | 1.0 |
| 2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 | 141.0 | 1.0 |
| 3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 | 206.0 | 1.0 |
| 4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 | 1.0 |
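The frame contains the 10 baseline features, the target, and the added intercept column. An optional shape check confirms the 442 samples and 12 columns:
print(df.shape)  # (442, 12)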
Preprocessing#
[2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
x_continuous = df[["age","bmi","bp","s1","s2","s3","s4","s5","s6","intercept"]].to_numpy()
x_categorical = np.where(df[['sex']]>0, 1, 0) # Convert to binary
y = df['target'].to_numpy()
(x_train_continuous,x_test_continuous,
 x_train_categorical,x_test_categorical,
 y_train,y_test) = train_test_split(x_continuous,
                                    x_categorical,
                                    y,
                                    test_size=0.5,random_state=0)
scaler = StandardScaler()
x_train_continuous[:,:-1] = scaler.fit_transform(x_train_continuous[:,:-1])
x_test_continuous[:,:-1] = scaler.transform(x_test_continuous[:,:-1])
y_train = scaler.fit_transform(y_train[:,np.newaxis])[:,0]
y_test = scaler.transform(y_test[:,np.newaxis])[:,0]
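As an optional sanity check, the scaled training columns should now have approximately zero mean and unit standard deviation; note that the intercept column is deliberately excluded from scaling.
print(x_train_continuous[:,:-1].mean(axis=0).round(3)) # approximately 0
print(x_train_continuous[:,:-1].std(axis=0).round(3))  # approximately 1
print(y_train.mean().round(3), y_train.std().round(3)) # approximately 0 and 1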
Create model instance#
First, calculate model constants.
[3]:
# feature dimensions
continuous_dim = x_train_continuous.shape[1]
categorical_dim = x_train_categorical.shape[1]
# ranges for continuous features
# these ranges will be recursively bisected
ranges = np.empty([continuous_dim,2])
ranges[:,0] = x_train_continuous.min(axis=0)
ranges[:,1] = x_train_continuous.max(axis=0)
# number of assignments for each feature on a path
# -1 means that the feature can be assigned any number of times
num_assignment_vec = -np.ones(continuous_dim+categorical_dim,dtype=int)
num_assignment_vec[continuous_dim-1] = 0 # intercept term is never assigned
Then, create a model instance based on the model constants.
[4]:
from bayesml import metatree,linearregression
# create model
model = metatree.LearnModel(
    c_dim_continuous=continuous_dim,
    c_dim_categorical=categorical_dim,
    c_max_depth=10,
    c_num_assignment_vec=num_assignment_vec,
    c_ranges=ranges,
    h0_g=0.75,
    SubModel=linearregression, # leaf model is linear regression
    sub_constants={'c_degree':continuous_dim}, # linear regression degree
    sub_h0_params={'h0_mu_vec':np.zeros(continuous_dim), # hyperparameters for normal-gamma prior
                   'h0_lambda_mat':np.eye(continuous_dim),
                   'h0_alpha':2.1,
                   'h0_beta':1.0},
)
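For reference, each leaf of a metatree here holds a Bayesian linear regression model. A sketch of the usual normal-gamma formulation assumed by these hyperparameters (see the bayesml.linearregression documentation for the exact parameterization) is

$$p(y \mid \boldsymbol{x}, \boldsymbol{\theta}, \tau) = \mathcal{N}(y \mid \boldsymbol{\theta}^\top \boldsymbol{x}, \tau^{-1}), \qquad p(\boldsymbol{\theta}, \tau) = \mathcal{N}\bigl(\boldsymbol{\theta} \mid \boldsymbol{\mu}_0, (\tau\boldsymbol{\Lambda}_0)^{-1}\bigr)\,\mathrm{Gam}(\tau \mid \alpha_0, \beta_0),$$

where $\boldsymbol{\mu}_0$, $\boldsymbol{\Lambda}_0$, $\alpha_0$, and $\beta_0$ correspond to h0_mu_vec, h0_lambda_mat, h0_alpha, and h0_beta.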
Calculate posterior distribution#
[5]:
model.update_posterior(
    x_train_continuous,
    x_train_categorical,
    y_train,
    alg_type='REMTMCMC',
    burn_in=100,
    num_metatrees=500,
    threshold_type='sample_midpoint',
    seed=0,
    g_max=0.9,
    num_chains=8,
)
brun_in: 100
99
num_metatrees: 500
499
[5]:
<bayesml.metatree._metatree.LearnModel at 0x14f49d0d0>
Calculate prediction interval#
First, define subroutines to calculate the $100(1-\alpha)\%$ prediction interval from the values of the probability density function of the predictive distribution. The initial search range is determined by the mean and the variance of the predictive distribution.
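In other words, the interval is a highest-density region: a threshold $k$ on the predictive density $p(y)$ is found by bisection such that

$$\int_{\{y \,:\, p(y) \ge k\}} p(y)\,\mathrm{d}y = 1-\alpha,$$

and the prediction interval consists of the points where $p(y) \ge k$. The initial search range $[\mu - \sigma/\sqrt{\alpha},\ \mu + \sigma/\sqrt{\alpha}]$ is wide enough because, by Chebyshev's inequality, $\Pr(|Y-\mu| \ge \sigma/\sqrt{\alpha}) \le \alpha$.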
[6]:
from scipy.integrate import simpson
from scipy.optimize import bisect
def calc_prediction_interval_sub(k,x,y,alpha):
    # Difference between the probability mass of the region where the density
    # is at least k and the target coverage 1-alpha. bisect searches for the
    # threshold k at which this difference becomes zero.
    return simpson(y[y>=k],x=x[:np.count_nonzero(y>=k)])-(1-alpha)

def calc_prediction_interval(model,alpha,num_steps):
    mean = model.make_prediction(loss='squared')
    sd = np.sqrt(model.calc_pred_var())
    l = mean - sd/np.sqrt(alpha) # Lower bound by Chebyshev's inequality
    r = mean + sd/np.sqrt(alpha) # Upper bound by Chebyshev's inequality
    x = np.linspace(l,r,num_steps)
    y = model.calc_pred_density(x)
    k_max = y.max()
    # Find the density threshold k whose upper level set has probability 1-alpha
    k = bisect(calc_prediction_interval_sub,0,k_max,(x,y,alpha))
    # Collect the points where the density crosses the threshold k
    l_list = []
    r_list = []
    for i in range(len(x)-1):
        if y[i] < k and k <= y[i+1]:
            l_list.append(x[i+1])
        if y[i] >= k and k > y[i+1]:
            r_list.append(x[i])
    return l_list,r_list
Calculate prediction interval by using the above subroutines.
[7]:
pred_interval = []
for i in range(y_test.shape[0]):
    # calculate predictive distribution for the i-th test data point
    model.calc_pred_dist(
        x_test_continuous[i],
        x_test_categorical[i],
    )
    # calculate prediction interval
    pred_interval.append(calc_prediction_interval(model,0.05,1000))
    print(f'{i}/{y_test.shape[0]-1}',end='\r')
220/220
Evaluation#
First, define subroutines for evaluation.
[8]:
# Calculate the frequency with which the true value falls within the prediction interval
def calc_frequency(y,pred_intervals):
    tmp_frequency = 0
    for i,interval in enumerate(pred_intervals):
        for j in range(len(interval[0])):
            if interval[0][j] < y[i] and y[i] < interval[1][j]:
                tmp_frequency += 1
    return tmp_frequency/len(pred_intervals)

# Calculate the average length of the prediction intervals
def calc_length(pred_intervals):
    tmp_length = 0
    for interval in pred_intervals:
        for i in range(len(interval[0])):
            tmp_length += (interval[1][i]-interval[0][i])
    return tmp_length/len(pred_intervals)
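As a small illustration of the expected input format (with hypothetical toy values), each element of pred_intervals is a pair of lists holding the lower and upper endpoints of possibly several disjoint intervals:
toy_y = np.array([0.0, 2.0])
toy_intervals = [([-1.0], [1.0]),            # one interval [-1, 1]; contains y = 0
                 ([-1.0, 1.5], [0.5, 1.8])]  # two disjoint intervals; neither contains y = 2
print(calc_frequency(toy_y, toy_intervals))  # 0.5
print(calc_length(toy_intervals))            # (2.0 + 1.5 + 0.3) / 2 ≈ 1.9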
Evaluate the results. Note that a frequency close to $100(1-\alpha)\%$ (here, 95%) and a short interval length are desirable.
[9]:
print(f'Avg. frequency: {calc_frequency(y_test,pred_interval)}')
print(f'Avg. length: {calc_length(pred_interval)}')
Avg. frequency: 0.9366515837104072
Avg. length: 2.700600606247248
Calculate prediction interval by sampling#
Using metatree.GenModel, prediction intervals can also be calculated from samples drawn from the posterior predictive distribution.
[10]:
gen_model = metatree.GenModel(
    c_dim_continuous=x_train_continuous.shape[1],
    c_dim_categorical=x_train_categorical.shape[1],
    c_max_depth=10,
    c_num_assignment_vec=num_assignment_vec,
    c_ranges=ranges,
    h_g=0.75,
    SubModel=linearregression,
    sub_constants={'c_degree':x_train_continuous.shape[1]},
    sub_h_params={'h_mu_vec':np.zeros(continuous_dim),
                  'h_lambda_mat':np.eye(continuous_dim),
                  'h_alpha':2.1,
                  'h_beta':1.0},
    seed=0,
)
# set hyperparameters for the generative model as those of the learned model
gen_model.set_h_params(
    h_metatree_list=model.hn_metatree_list,
    h_metatree_prob_vec=model.hn_metatree_prob_vec,
)
[10]:
<bayesml.metatree._metatree.GenModel at 0x16b81c610>
[11]:
from arviz import hdi
sample_size = 500
y_pred_sample = np.zeros(sample_size)
pred_interval = []
for i in range(y_test.shape[0]):
    for t in range(sample_size):
        # generate parameters from the posterior distribution
        gen_model.gen_params()
        # generate a sample according to the generated parameters
        _,_,y_pred_sample[t] = gen_model.gen_sample(
            x_continuous=x_test_continuous[i],
            x_categorical=x_test_categorical[i],
        )
    pred_interval.append(hdi(y_pred_sample,0.95,multimodal=True).T)
    print(f'{i}/{y_test.shape[0]-1}',end='\r')
220/220
[12]:
print(f'Avg. frequency: {calc_frequency(y_test,pred_interval)}')
print(f'Avg. length: {calc_length(pred_interval)}')
Avg. frequency: 0.9638009049773756
Avg. length: 2.8378308334394333