Prediction interval calculation by bayesml.metatree (accepted at AISTATS 2025)#

Installation of BayesML#

Installation from PyPI will be available soon. Until then, please follow these steps (the corresponding shell commands are sketched after the list):

  1. Clone BayesML from bayesml/BayesML.

  2. Change directory to the root of the cloned BayesML folder, where setup.py exists (e.g., /Users/UserName/Documents/GitHub/BayesML).

  3. Run pip install .
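
Assuming the repository is hosted at https://github.com/bayesml/BayesML (inferred from the bayesml/BayesML reference above), the three steps correspond roughly to:

git clone https://github.com/bayesml/BayesML.git
cd BayesML
pip install .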

Load dataset#

[1]:
import pandas as pd
from sklearn.datasets import load_diabetes
diabetes = load_diabetes(as_frame=True)
df = diabetes.frame
df['intercept'] = 1.0 # Add intercept column
df.head()
[1]:
age sex bmi bp s1 s2 s3 s4 s5 s6 target intercept
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019907 -0.017646 151.0 1.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068332 -0.092204 75.0 1.0
2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592 0.002861 -0.025930 141.0 1.0
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022688 -0.009362 206.0 1.0
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031988 -0.046641 135.0 1.0

Preprocessing#

[2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x_continuous = df[["age","bmi","bp","s1","s2","s3","s4","s5","s6","intercept"]].to_numpy()
x_categorical = np.where(df[['sex']]>0, 1, 0) # Convert to binary
y = df['target'].to_numpy()

(x_train_continuous,x_test_continuous,
 x_train_categorical,x_test_categorical,
 y_train,y_test) = train_test_split(x_continuous,
                                    x_categorical,
                                    y,
                                    test_size=0.5,random_state=0)

scaler = StandardScaler()
# standardize the continuous features (the intercept column is left as is)
x_train_continuous[:,:-1] = scaler.fit_transform(x_train_continuous[:,:-1])
x_test_continuous[:,:-1] = scaler.transform(x_test_continuous[:,:-1])
# re-fit the same scaler to standardize the target values
y_train = scaler.fit_transform(y_train[:,np.newaxis])[:,0]
y_test = scaler.transform(y_test[:,np.newaxis])[:,0]

Create model instance#

First, calculate model constants.

[3]:
# feature dimensions
continuous_dim = x_train_continuous.shape[1]
categorical_dim = x_train_categorical.shape[1]

# ranges for continuous features
# these ranges will be recursively bisected
ranges = np.empty([continuous_dim,2])
ranges[:,0] = x_train_continuous.min(axis=0)
ranges[:,1] = x_train_continuous.max(axis=0)

# number of assignments for each feature on a path
# -1 means that the feature can be assigned any number of times
num_assignment_vec = -np.ones(continuous_dim+categorical_dim,dtype=int)
num_assignment_vec[continuous_dim-1] = 0 # intercept term is never assigned

Then, create a model instance based on the model constants.

[4]:
from bayesml import metatree,linearregression

# create model
model = metatree.LearnModel(
    c_dim_continuous=continuous_dim,
    c_dim_categorical=categorical_dim,
    c_max_depth=10,
    c_num_assignment_vec=num_assignment_vec,
    c_ranges=ranges,
    h0_g=0.75,
    SubModel=linearregression, # leaf model is linear regression
    sub_constants={'c_degree':continuous_dim}, # linear regression degree
    sub_h0_params={'h0_mu_vec':np.zeros(continuous_dim), # hyperparameters for normal-gamma prior
                   'h0_lambda_mat':np.eye(continuous_dim),
                   'h0_alpha':2.1,
                   'h0_beta':1.0},
)

Calculate posterior distribution#

[5]:
model.update_posterior(
    x_train_continuous,
    x_train_categorical,
    y_train,
    alg_type='REMTMCMC',
    burn_in=100,
    num_metatrees=500,
    threshold_type='sample_midpoint',
    seed=0,
    g_max=0.9,
    num_chains=8,
)
brun_in: 100
99
num_metatrees: 500
499
[5]:
<bayesml.metatree._metatree.LearnModel at 0x14f49d0d0>

Calculate prediction interval#

First, define subroutines to calculate the 100(1-alpha)% prediction interval from the values of the probability density function of the predictive distribution. The initial search range is determined by the mean and the variance of the predictive distribution.
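
Concretely (this restates the computation in the next cell under the same assumptions; it is not a separate BayesML feature), the subroutines search by bisection for the density threshold k of a highest-density region,

\int_{\{y \,:\, p(y \mid x) \ge k\}} p(y \mid x)\, dy = 1 - \alpha ,

where p(y|x) is the predictive density. The density is evaluated on an equally spaced grid over [\mu - \sigma/\sqrt{\alpha},\ \mu + \sigma/\sqrt{\alpha}], with \mu and \sigma the predictive mean and standard deviation; by Chebyshev's inequality this range contains at least 1-\alpha of the predictive mass.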

[6]:
from scipy.integrate import simpson
from scipy.optimize import bisect

def calc_prediction_interval_sub(k,x,y,alpha):
    # Mass of the region where the density is >= k, minus (1-alpha); its root in k gives the threshold.
    # x is an equally spaced grid, so x[:np.count_nonzero(y>=k)] keeps the correct spacing for Simpson's rule.
    return simpson(y[y>=k],x=x[:np.count_nonzero(y>=k)])-(1-alpha)

def calc_prediction_interval(model,alpha,num_steps):
    mean = model.make_prediction(loss='squared')
    sd = np.sqrt(model.calc_pred_var())
    l = mean - sd/np.sqrt(alpha) # Lower bound by Chebyshev's inequality
    r = mean + sd/np.sqrt(alpha) # Upper bound by Chebyshev's inequality
    x = np.linspace(l,r,num_steps)
    y = model.calc_pred_density(x)
    k_max = y.max()
    k = bisect(calc_prediction_interval_sub,0,k_max,(x,y,alpha))
    l_list = []
    r_list = []
    # collect grid points where the density crosses the threshold k
    # (lower/upper endpoints of each sub-interval of the prediction region)
    for i in range(len(x)-1):
        if y[i] < k and k <= y[i+1]:
            l_list.append(x[i+1])
        if y[i] >= k and k > y[i+1]:
            r_list.append(x[i])

    return l_list,r_list

Calculate a prediction interval for each test point using the above subroutines.

[7]:
pred_interval = []
for i in range(y_test.shape[0]):
    # calculate predictive distribution for the i-th test data
    model.calc_pred_dist(
        x_test_continuous[i],
        x_test_categorical[i],
    )
    # calculate prediction interval
    pred_interval.append(calc_prediction_interval(model,0.05,1000))
    print(f'{i}/{y_test.shape[0]-1}',end='\r')
220/220

Evaluation#

First, define subroutines for evaluation.

[8]:
# Calculate the relative frequency with which the true value falls inside the prediction interval
def calc_frequency(y,pred_intervals):
    tmp_frequency = 0
    for i,interval in enumerate(pred_intervals):
        for j in range(len(interval[0])):
            if interval[0][j] < y[i] and y[i] < interval[1][j]:
                tmp_frequency += 1

    return tmp_frequency/len(pred_intervals)

# Calculate the average length of the prediction intervals
def calc_length(pred_intervals):
    tmp_length = 0
    for interval in pred_intervals:
        for i in range(len(interval[0])):
            tmp_length += (interval[1][i]-interval[0][i])

    return tmp_length/len(pred_intervals)
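
As a quick sanity check of these helpers on hypothetical toy values (not part of the diabetes data above): each entry of pred_intervals is a pair (lower endpoints, upper endpoints), possibly with several sub-intervals per test point.

toy_intervals = [([-1.0], [1.0]),             # one sub-interval for the first test point
                 ([-2.0, 0.5], [-1.5, 2.0])]  # two sub-intervals for the second test point
toy_y = np.array([0.0, 1.0])
print(calc_frequency(toy_y, toy_intervals)) # 1.0: both true values fall inside a sub-interval
print(calc_length(toy_intervals))           # 2.0: average total interval length per test point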

Evaluate the results. Note that a frequency close to 1-alpha (here, 0.95) and a short length are desirable.

[9]:
print(f'Avg. frequency: {calc_frequency(y_test,pred_interval)}')
print(f'Avg. length: {calc_length(pred_interval)}')
Avg. frequency: 0.9366515837104072
Avg. length: 2.700600606247248

Calculate prediction interval by sampling#

Using metatree.GenModel, prediction intervals can also be calculated from samples drawn from the posterior predictive distribution.

[10]:
gen_model = metatree.GenModel(
    c_dim_continuous=x_train_continuous.shape[1],
    c_dim_categorical=x_train_categorical.shape[1],
    c_max_depth=10,
    c_num_assignment_vec=num_assignment_vec,
    c_ranges=ranges,
    h_g=0.75,
    SubModel=linearregression,
    sub_constants={'c_degree':x_train_continuous.shape[1]},
    sub_h_params={'h_mu_vec':np.zeros(continuous_dim),
                   'h_lambda_mat':np.eye(continuous_dim),
                   'h_alpha':2.1,
                   'h_beta':1.0},
    seed=0,
)

# set the hyperparameters of the generative model to those of the learned model
gen_model.set_h_params(
    h_metatree_list=model.hn_metatree_list,
    h_metatree_prob_vec=model.hn_metatree_prob_vec,
)
[10]:
<bayesml.metatree._metatree.GenModel at 0x16b81c610>
[11]:
from arviz import hdi

sample_size = 500
y_pred_sample = np.zeros(sample_size)
pred_interval = []

for i in range(y_test.shape[0]):
    for t in range(sample_size):
        # generate parameters from the posterior distribution
        gen_model.gen_params()
        # generate a sample according to the generated parameters
        _,_,y_pred_sample[t] = gen_model.gen_sample(
            x_continuous=x_test_continuous[i],
            x_categorical=x_test_categorical[i],
        )
    pred_interval.append(hdi(y_pred_sample,0.95,multimodal=True).T)
    print(f'{i}/{y_test.shape[0]-1}',end='\r')
220/220
[12]:
print(f'Avg. frequency: {calc_frequency(y_test,pred_interval)}')
print(f'Avg. length: {calc_length(pred_interval)}')
Avg. frequency: 0.9638009049773756
Avg. length: 2.8378308334394333