I would like to build a predictive model to predict the following numerical labels: self.varname_label = ['SUMMED_ALLCAUSE_NUM_POST2YR', 'SUMMED_DXTARGET_NUM_POST2YR', 'SUMMED_ALLCAUSE_COST_POST2YR', 'SUMMED_DXTARGET_COST_POST2YR']. These refer to 1) the number of healthcare visits post index, 2) the number of healthcare visits for a specific medical condition post index, 3) the cost of healthcare visits post index, and 4) the cost of healthcare visits for a specific medical condition post index, respectively. Each run uses only one of these labels at a time, as in self.y_label = self.varname_label[0].
I have both numerical and categorical features, which require transformation. I used this tutorial as a guide on how to transform/standardize the features in one step using the ColumnTransformer.
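For context, the pattern I followed looks roughly like the sketch below. It uses toy data and hypothetical column names rather than my actual dataset, and a StandardScaler stands in for the Normalizer I use in the real script; any per-column transformer slots in the same way.

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy frame with one categorical and two numerical columns (hypothetical names)
df_toy = pd.DataFrame({
    'SEX': ['M', 'F', 'F', 'M'],
    'AGE': [34, 51, 47, 62],
    'COST_PRE': [120.0, 980.5, 0.0, 310.2],
})

# One transformer per column group, fitted and applied in a single step
col_transformer = ColumnTransformer([
    ('CATE_COL', OneHotEncoder(categories=[['M', 'F']]), ['SEX']),
    ('NUM_COL', StandardScaler(), ['AGE', 'COST_PRE']),
])

X_toy = col_transformer.fit_transform(df_toy)  # 2 one-hot columns + 2 scaled numeric columns
print(X_toy)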
In the model that predicts SUMMED_ALLCAUSE_NUM_POST2YR, as a test I duplicate this outcome label as an additional feature, SUMMED_ALLCAUSE_NUM_POST2YR_DUP, which I expect to give close to perfect prediction (i.e., a Variance score close to 1.0). However, I only get Variance score = 0.06.
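To illustrate the sanity check I have in mind, here is a minimal, self-contained sketch on toy data (hypothetical names, no ColumnTransformer involved): when an exact copy of the target is included as a feature, linear regression recovers it almost perfectly.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(888)
y = rng.poisson(lam=5, size=500).astype(float)  # pretend visit counts
noise = rng.normal(size=500)  # an uninformative feature
X = np.column_stack([noise, y])  # second column is an exact copy of the label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=888)
regressor = LinearRegression().fit(X_train, y_train)
print('Variance score: %.2f' % r2_score(y_test, regressor.predict(X_test)))  # prints ~1.00

My actual script follows.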
import pandas as pd
import numpy as np
import ctypes
import re
import pickle
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import helper_functions.helper_functions as hf
import sec1_data_preparation as data_prep
import sec2_prepped_data_import as prepped_data_import
# Main class
######################################################################
class Machine_Learning_ProjectX(data_prep.DataPreparation_ProjectX):
def __init__(self):
self.pickle_descriptive_stats_demographic = None
self.pickle_descriptive_stats_clinical = None
self.pickle_descriptive_stats_rx = None
self.pickle_descriptive_stats_csu = None
self.df_demographic = None
self.df_clinical = None
self.df_rx = None
self.df_csu = None
self.df_master = None
self.varname_cat_all = ['INDEX_RURAL_CAT', 'INDEX_SEX', 'AIDS_TAG', 'CHF_TAG', 'CKD_TAG', 'CLD_MILD_TAG', 'CLD_SEVERE_TAG',
'COPD_TAG', 'CTD_TAG', 'CVA_TAG', 'DM_MILD_TAG', 'DM_SEVERE_TAG', 'METS_TAG', 'MI_TAG', 'PUD_TAG',
'PVD_TAG', 'DEMENTIA_TAG', 'HEMIPLEGIA_TAG', 'TUMOR_TAG', 'INDEX_DIN_CAT']
self.varname_cat_used = ['INDEX_RURAL_CAT', 'INDEX_SEX']
self.varname_num_all = ['INDEX_AGE', 'CCI_SCORE', 'PREINDEX1YR_N_DRUGX_FG_MPR', 'PREINDEX1YR_N_DRUGX_SG_MPR', 'PREINDEX1YR_N_DRUGY_TYPICAL_MPR',
'PREINDEX1YR_N_DRUGY_ATYPICAL_MPR', 'POSTINDEX1YR_N_DRUGX_FG_MPR', 'POSTINDEX1YR_N_DRUGX_SG_MPR',
'POSTINDEX1YR_N_DRUGY_TYPICAL_MPR', 'POSTINDEX1YR_N_DRUGY_ATYPICAL_MPR',
'SUMMED_ALLCAUSE_NUM_PRE2YR', 'SUMMED_ALLCAUSE_NUM_POST2YR', 'SUMMED_ALLCAUSE_COST_PRE2YR',
'SUMMED_ALLCAUSE_COST_POST2YR', 'SUMMED_DXTARGET_NUM_PRE2YR', 'SUMMED_DXTARGET_NUM_POST2YR',
'SUMMED_DXTARGET_COST_PRE2YR', 'SUMMED_DXTARGET_COST_POST2YR', 'DAD_ALLCAUSE_NUM_PRE2YR',
'DAD_ALLCAUSE_NUM_POST2YR', 'DAD_ALLCAUSE_COST_PRE2YR', 'DAD_ALLCAUSE_COST_POST2YR',
'DAD_DXTARGET_NUM_PRE2YR', 'DAD_DXTARGET_NUM_POST2YR', 'DAD_DXTARGET_COST_PRE2YR',
'DAD_DXTARGET_COST_POST2YR', 'PC_ALLCAUSE_NUM_PRE2YR', 'PC_ALLCAUSE_NUM_POST2YR',
'PC_ALLCAUSE_COST_PRE2YR', 'PC_ALLCAUSE_COST_POST2YR', 'PC_DXTARGET_NUM_PRE2YR',
'PC_DXTARGET_NUM_POST2YR', 'PC_DXTARGET_COST_PRE2YR', 'PC_DXTARGET_COST_POST2YR',
'NACRS_ALLCAUSE_NUM_PRE2YR', 'NACRS_ALLCAUSE_NUM_POST2YR', 'NACRS_ALLCAUSE_COST_PRE2YR',
'NACRS_ALLCAUSE_COST_POST2YR', 'NACRS_DXTARGET_NUM_PRE2YR', 'NACRS_DXTARGET_NUM_POST2YR',
'NACRS_DXTARGET_COST_PRE2YR', 'NACRS_DXTARGET_COST_POST2YR']
self.varname_num_used = ['INDEX_AGE', 'CCI_SCORE', 'SUMMED_ALLCAUSE_NUM_PRE2YR', 'SUMMED_DXTARGET_NUM_PRE2YR',
'SUMMED_ALLCAUSE_COST_PRE2YR', 'SUMMED_DXTARGET_COST_PRE2YR']
self.varname_id = ['PHN_ENC', 'INDEX_DATE']
self.varname_label = ['SUMMED_ALLCAUSE_NUM_POST2YR', 'SUMMED_DXTARGET_NUM_POST2YR', 'SUMMED_ALLCAUSE_COST_POST2YR',
'SUMMED_DXTARGET_COST_POST2YR']
self.y_label = self.varname_label[0]
self.varname_import = list(set(self.varname_id+self.varname_cat_used+self.varname_num_used))+[self.y_label]
def ml_steps(self):
self.import_references()
self.import_pickle_descriptive_stats_demographic(on_switch=True,
import_dir=self.result_dir,
import_filename='ProjectX_V2_SubjectGroup_DescriptiveStats_Demographic.pickle')
self.import_pickle_descriptive_stats_clinical(on_switch=True,
import_dir=self.result_dir,
import_filename='ProjectX_V2_SubjectGroup_DescriptiveStats_Clinical.pickle')
self.import_pickle_descriptive_stats_rx(on_switch=True,
import_dir=self.result_dir,
import_filename='ProjectX_V2_SubjectGroup_DescriptiveStats_Rx.pickle')
self.import_pickle_descriptive_stats_csu(on_switch=True,
import_dir=self.result_dir,
import_filename='ProjectX_V2_SubjectGroup_DescriptiveStats_CSU.pickle')
self.import_df_demographic(on_switch=True,
import_dir=self.result_dir,
import_filename='ProjectX_V2_SubjectGroup_DF_Demographic_SubjectLevel.csv')
self.import_df_clinical(on_switch=True,
import_dir=self.result_dir,
import_filename='ProjectX_V2_SubjectGroup_DF_Clinical_SubjectLevel.csv')
self.import_df_rx(on_switch=True,
import_dir=self.result_dir,
import_filename='ProjectX_V2_SubjectGroup_DF_Rx_SubjectLevel.csv')
self.import_df_csu(on_switch=True,
import_dir=self.result_dir,
import_filename='ProjectX_V2_SubjectGroup_DF_CSU_SubjectLevel.csv')
self.merge_dfs(on_switch=True)
self.visualize_descriptive_stats(on_switch=False)
self.split_into_training_and_test_sets(on_switch=True)
self.generate_new_feature(on_switch=False)
self.handle_missing_value(on_switch=True)
self.standardize_value(on_switch=True)
self.ml_pipeline(on_switch=True)
self.report_result(on_switch=True)
def import_references(self):
super().__init__()
super()._pandas_output_setting()
super().dir_name()
super().file_name()
super().constant_var()
super().import_ref_data()
# Decorators
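# on_or_off runs the decorated method only when it is called with on_switch=True; otherwise the call is skipped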
def on_or_off(func):
def wrapper(self, *args, on_switch=False, **kwargs):
if on_switch:
func(self, *args, on_switch=on_switch, **kwargs)
return wrapper
# Core class functions
@on_or_off
def import_pickle_descriptive_stats_demographic(self, on_switch, import_dir=None, import_filename=None):
with open(import_dir+import_filename, 'rb') as handle:
self.pickle_descriptive_stats_demographic = pickle.load(handle)
@on_or_off
def import_pickle_descriptive_stats_clinical(self, on_switch, import_dir=None, import_filename=None):
with open(import_dir+import_filename, 'rb') as handle:
self.pickle_descriptive_stats_clinical = pickle.load(handle)
@on_or_off
def import_pickle_descriptive_stats_rx(self, on_switch, import_dir=None, import_filename=None):
with open(import_dir+import_filename, 'rb') as handle:
self.pickle_descriptive_stats_rx = pickle.load(handle)
@on_or_off
def import_pickle_descriptive_stats_csu(self, on_switch, import_dir=None, import_filename=None):
with open(import_dir+import_filename, 'rb') as handle:
self.pickle_descriptive_stats_csu = pickle.load(handle)
@on_or_off
def import_df_demographic(self, on_switch, import_dir=None, import_filename=None):
self.df_demographic = pd.read_csv(import_dir+import_filename, dtype={'PHN_ENC':'str'})
@on_or_off
def import_df_clinical(self, on_switch, import_dir=None, import_filename=None):
self.df_clinical = pd.read_csv(import_dir+import_filename, dtype={'PHN_ENC':'str'})
@on_or_off
def import_df_rx(self, on_switch, import_dir=None, import_filename=None):
self.df_rx = pd.read_csv(import_dir+import_filename, dtype={'PHN_ENC':'str'})
@on_or_off
def import_df_csu(self, on_switch, import_dir=None, import_filename=None):
self.df_csu = pd.read_csv(import_dir+import_filename, dtype={'PHN_ENC':'str'})
@on_or_off
def merge_dfs(self, on_switch):
self.df_master = self.df_demographic.copy()
self.df_master = self.df_master.merge(self.df_clinical, on='PHN_ENC', how='outer')
self.df_master = self.df_master.merge(self.df_rx, on='PHN_ENC', how='outer')
self.df_master = self.df_master.merge(self.df_csu, on='PHN_ENC', how='outer')
assert (len(self.df_master)==self.df_master['PHN_ENC'].nunique()), 'Error: Same subject appears on multiple rows.'
# Remove duplicated columns
self.df_master = self.df_master.loc[:,~self.df_master.columns.str.contains('_y', case=True)]
self.df_master.columns = self.df_master.columns.str.replace('_x', '')
self.df_master = self.df_master.loc[:,~self.df_master.columns.duplicated()]
# Remove unused columns
self.df_master = self.df_master.loc[:, ~self.df_master.columns.str.contains('^Unnamed')]
self.df_master = self.df_master.drop(['temp'], axis=1)
# Retain only needed columns
self.df_master = self.df_master[self.varname_import]
# For testing
self.df_master['SUMMED_ALLCAUSE_NUM_POST2YR_DUP'] = self.df_master['SUMMED_ALLCAUSE_NUM_POST2YR']
@on_or_off
def visualize_descriptive_stats(self, on_switch):
scatter_plot = self.df_master.plot.scatter( x='INDEX_AGE',
y=self.y_label,
c='DarkBlue')
plt.show()
@on_or_off
def split_into_training_and_test_sets(self, on_switch):
feature_set = self.df_master.drop([self.y_label], axis=1)
X_label = feature_set.columns[:len(feature_set.columns)]
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.df_master[X_label],
self.df_master[self.y_label], test_size=0.33, random_state=888)
self.X_train = self.X_train.drop(['PHN_ENC', 'INDEX_DATE'], axis=1)
self.X_test = self.X_test.drop(['PHN_ENC', 'INDEX_DATE'], axis=1)
@on_or_off
def generate_new_feature(self, on_switch):
pass
@on_or_off
def handle_missing_value(self, on_switch):
self.X_train = self.X_train.apply(lambda x:x.fillna(x.value_counts().index[0]))
self.X_test = self.X_test.apply(lambda x:x.fillna(x.value_counts().index[0]))
self.y_train = self.y_train.fillna(0)
self.y_test = self.y_test.fillna(0)
@on_or_off
def standardize_value(self, on_switch):
var_value_pairs = {
'INDEX_RURAL_CAT':['URBAN', 'RURAL'],
'INDEX_SEX':['M', 'F'],
'AIDS_TAG':['AIDS', 'NON-AIDS'],
'CHF_TAG':['CHF', 'NON-CHF'],
'CKD_TAG':['CKD', 'NON-CKD'],
'CLD_MILD_TAG':['CLD_MILD', 'NON-CLD_MILD'],
'CLD_SEVERE_TAG':['CLD_SEVERE', 'NON-CLD_SEVERE'],
'COPD_TAG':['COPD', 'NON-COPD'],
'CTD_TAG':['CTD', 'NON-CTD'],
'CVA_TAG':['CVA', 'NON-CVA'],
'DM_MILD_TAG':['DM_MILD', 'NON-DM_MILD'],
'DM_SEVERE_TAG':['DM_SEVERE', 'NON-DM_SEVERE'],
'METS_TAG':['METS', "NON-METS"],
'MI_TAG':['MI', 'NON-MI'],
'PUD_TAG':['PUD', 'NON-PUD'],
'PVD_TAG':['PVD', 'NON-PVD'],
'DEMENTIA_TAG':['DEMENTIA', 'NON-DEMENTIA'],
'HEMIPLEGIA_TAG':['HEMIPLEGIA', 'NON-HEMIPLEGIA'],
'TUMOR_TAG':['TUMOR', 'NON-TUMOR'],
'INDEX_DIN_CAT':['ARIPIPRAZOLE', 'RISPERIDONE', 'PALIPERIDONE'],
}
def extract_values_from_var_cat_used():
var_cat_used_categories = []
for varname in self.varname_cat_used:
for key, val in var_value_pairs.items():
if varname == key:
var_cat_used_categories.append(val)
return var_cat_used_categories
var_cat_used_categories = extract_values_from_var_cat_used()
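# Single ColumnTransformer: one-hot encode the categorical features and apply the Normalizer to the numerical features (plus the duplicated label, for testing)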
colT = ColumnTransformer(
[ ('CATE_COL', OneHotEncoder(categories=var_cat_used_categories), self.varname_cat_used),
('NORM_COL', Normalizer(norm='l1'), self.varname_num_used+['SUMMED_ALLCAUSE_NUM_POST2YR_DUP']) # For testing
])
self.X_train = colT.fit_transform(self.X_train)
self.X_test = colT.transform(self.X_test)
@on_or_off
def ml_pipeline(self, on_switch):
self.regressor = LinearRegression()
self.regressor.fit(self.X_train, self.y_train) # training the algorithm
self.y_pred = self.regressor.predict(self.X_test) # making prediction
@on_or_off
def report_result(self, on_switch):
# The coefficients
##print('Coefficients: \n', self.regressor.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(self.y_test, self.y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(self.y_test, self.y_pred))
# Helper functions
######################################################################
# Main function
######################################################################
def main():
x = Machine_Learning_ProjectX()
x.ml_steps()
if __name__ == '__main__':
main()
When I remove SUMMED_ALLCAUSE_NUM_POST2YR_DUP as a feature by replacing ('NORM_COL', Normalizer(norm='l1'), self.varname_num_used+['SUMMED_ALLCAUSE_NUM_POST2YR_DUP']) with ('NORM_COL', Normalizer(norm='l1'), self.varname_num_used), the Variance score is -0.01.
I did another test that also removes SUMMED_ALLCAUSE_NUM_POST2YR_DUP but uses one of the other labels as a feature, for example self.varname_num_used = ['INDEX_AGE', 'CCI_SCORE', 'SUMMED_ALLCAUSE_NUM_PRE2YR', 'SUMMED_DXTARGET_NUM_PRE2YR', 'SUMMED_ALLCAUSE_COST_PRE2YR', 'SUMMED_DXTARGET_COST_PRE2YR', 'SUMMED_ALLCAUSE_COST_POST2YR']; this time the Variance score is 0.29.
Does it seem strange that replicating the label variable as a feature performs only 6% better than the base model, while using another label as an additional feature gives a 29% boost? The base model's Variance score of -0.01 also seems strange to me, since I expect the feature set to have some decent predictive value.