Machine Learning Reflective Essay Code

'''
ML Project for Module:
BAA10127 - Data Analytics: Machine Learning & Advanced Python
Student No. 21311696
Student Name: Rory James Mulhern
Course: BSI4
Dataset: https://www.kaggle.com/datasets/taweilo/loan-approval-classification-data
'''

'\nML Project for Module:\nBAA10127 - Data Analytics: Machine Learning & Advanced Python\nStudent No. 21311696\nStudent Name: Rory James Mulhern\nCourse: BSI4\nDataset: https://www.kaggle.com/datasets/taweilo/loan-approval-classification-data\n'

# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Linking file to Code
filepath = '/Users/mulhr/Desktop/ML Project/loan_data.csv'

# Importing the dataset
loans_df = pd.read_csv(filepath)

loans_df

	person_age	person_gender	person_education	person_income	person_emp_exp	person_home_ownership	loan_amnt	loan_intent	loan_int_rate	loan_percent_income	cb_person_cred_hist_length	credit_score	previous_loan_defaults_on_file	loan_status
0	22.0	female	Master	71948.0	0	RENT	35000.0	PERSONAL	16.02	0.49	3.0	561	No	1
1	21.0	female	High School	12282.0	0	OWN	1000.0	EDUCATION	11.14	0.08	2.0	504	Yes	0
2	25.0	female	High School	12438.0	3	MORTGAGE	5500.0	MEDICAL	12.87	0.44	3.0	635	No	1
3	23.0	female	Bachelor	79753.0	0	RENT	35000.0	MEDICAL	15.23	0.44	2.0	675	No	1
4	24.0	male	Master	66135.0	1	RENT	35000.0	MEDICAL	14.27	0.53	4.0	586	No	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
44995	27.0	male	Associate	47971.0	6	RENT	15000.0	MEDICAL	15.66	0.31	3.0	645	No	1
44996	37.0	female	Associate	65800.0	17	RENT	9000.0	HOMEIMPROVEMENT	14.07	0.14	11.0	621	No	1
44997	33.0	male	Associate	56942.0	7	RENT	2771.0	DEBTCONSOLIDATION	10.02	0.05	10.0	668	No	1
44998	29.0	male	Bachelor	33164.0	4	RENT	12000.0	EDUCATION	13.23	0.36	6.0	604	No	1
44999	24.0	male	High School	51609.0	1	RENT	6665.0	DEBTCONSOLIDATION	17.05	0.13	3.0	628	No	1

45000 rows × 14 columns

# Youssef Elbadry Accessed: 9th April 2025

# Looking at info on the data
loans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  45000 non-null  object 
 13  loan_status                     45000 non-null  int64  
dtypes: float64(6), int64(3), object(5)
memory usage: 4.8+ MB

# Changing the person_age column to an integer
loans_df['person_age'] = loans_df['person_age'].astype(int)

# Looking at info on the data
loans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  int64  
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  45000 non-null  object 
 13  loan_status                     45000 non-null  int64  
dtypes: float64(5), int64(4), object(5)
memory usage: 4.8+ MB

# Removing duplicate rows
loans_df.drop_duplicates(inplace=True)

# Check if there are any duplicates left
duplicate_count = loans_df.duplicated().sum()

# Display final check
if duplicate_count == 0:
    print("No duplicate values in the dataset.")
else:
    print(f"Total duplicate values remaining: {duplicate_count}")

No duplicate values in the dataset.

# Looking at the data description see the statistics of numeric columns
loans_df.describe().T

	count	mean	std	min	25%	50%	75%	max
person_age	45000.0	27.764178	6.045108	20.00	24.00	26.00	30.00	144.00
person_income	45000.0	80319.053222	80422.498632	8000.00	47204.00	67048.00	95789.25	7200766.00
person_emp_exp	45000.0	5.410333	6.063532	0.00	1.00	4.00	8.00	125.00
loan_amnt	45000.0	9583.157556	6314.886691	500.00	5000.00	8000.00	12237.25	35000.00
loan_int_rate	45000.0	11.006606	2.978808	5.42	8.59	11.01	12.99	20.00
loan_percent_income	45000.0	0.139725	0.087212	0.00	0.07	0.12	0.19	0.66
cb_person_cred_hist_length	45000.0	5.867489	3.879702	2.00	3.00	4.00	8.00	30.00
credit_score	45000.0	632.608756	50.435865	390.00	601.00	640.00	670.00	850.00
loan_status	45000.0	0.222222	0.415744	0.00	0.00	0.00	0.00	1.00

# Youssef Elbadry Accessed: 9th April 2025

# Seeing which columns are Categorical and Numerical
cat_cols = [var for var in loans_df.columns if loans_df[var].dtypes == 'object']
num_cols = [var for var in loans_df.columns if loans_df[var].dtypes != 'object']

print(f'Categorical columns: {cat_cols}')
print(f'Numerical columns: {num_cols}')

Categorical columns: ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']
Numerical columns: ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'loan_status']

cat_cols

['person_gender',
 'person_education',
 'person_home_ownership',
 'loan_intent',
 'previous_loan_defaults_on_file']

# Seeing the split in gender
loans_df['person_gender'].value_counts()

person_gender
male      24841
female    20159
Name: count, dtype: int64

# Youssef Elbadry Accessed: 9th April 2025
def plot_categorical_column(dataframe, column):

    plt.figure(figsize=(7, 7))
    ax = sns.countplot(x=dataframe[column])
    total_count = len(dataframe[column])
    threshold = 0.05 * total_count
    category_counts = dataframe[column].value_counts(normalize=True) * 100
    ax.axhline(threshold, color='red', linestyle='--', label=f'0.05% of total count ({threshold:.0f})')
    
    for p in ax.patches:
        height = p.get_height()
        percentage = (height / total_count) * 100
        ax.text(p.get_x() + p.get_width() / 2., height + 0.02 * total_count, f'{percentage:.2f}%', ha="center")
    
    plt.title(f'Label Cardinality for "{column}" Column')
    plt.ylabel('Count')
    plt.xlabel(column)
    plt.tight_layout()
    
    plt.legend()
    plt.show()

for col in cat_cols:
    plot_categorical_column(loans_df, col)

loans_df[num_cols].hist(bins=30, figsize=(12,10))
plt.show()

label_prop = loans_df['loan_status'].value_counts()

plt.pie(label_prop.values, labels=['Rejected (0)', 'Approved (1)'], autopct='%.2f')
plt.title('Target label proportions')
plt.show()

'''
Article saying most lenders will not lend to anyone above 70
https://www.moneysupermarket.com/loans/loans-for-pensioners/#:~:text=Most%20lenders%20have%20a%20maximum,beyond%20this%20age%20is%20rare.
'''
loans_df = loans_df[loans_df['person_age']<= 70]
print('Ages above 70 removed!')

Ages above 70 removed!

loans_df[num_cols].hist(bins=30, figsize=(12,10))
plt.show()

# Sulani Ishara Accessed: 14th April 2025
numerical_columns = ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score']

fig, axes = plt.subplots(4, 2, figsize=(16, 20))
fig.suptitle('Numerical Features vs Loan Status (Density Plots)', fontsize=16)

for i, col in enumerate(numerical_columns):
    sns.kdeplot(data=loans_df, x=col, hue='loan_status', ax=axes[i//2, i%2], fill=True, common_norm=False, palette='muted')
    axes[i//2, i%2].set_title(f'{col} vs Loan Status')
    axes[i//2, i%2].set_xlabel(col)
    axes[i//2, i%2].set_ylabel('Density')

fig.delaxes(axes[3, 1])

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# Box and Whisker plot to see what the outliers in the dataset look like
# Sulani Ishara Accessed: 14th April 2025

# Function to perform univariate analysis for numeric columns
def univariate_analysis(data, column, title):
    plt.figure(figsize=(10, 2))
    
    sns.boxplot(x=data[column], color='sandybrown')
    plt.title(f'{title} Boxplot')
    
    plt.tight_layout()
    plt.show()

    print(f'\nSummary Statistics for {title}:\n', data[column].describe())

columns_to_analyse = ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score']

for column in columns_to_analyse:
    univariate_analysis(loans_df, column, column.replace('_', ' '))


Summary Statistics for person age:
 count    44985.000000
mean        27.739335
std          5.870099
min         20.000000
25%         24.000000
50%         26.000000
75%         30.000000
max         70.000000
Name: person_age, dtype: float64


Summary Statistics for person income:
 count    4.498500e+04
mean     7.991017e+04
std      6.332666e+04
min      8.000000e+03
25%      4.719200e+04
50%      6.704600e+04
75%      9.578200e+04
max      2.448661e+06
Name: person_income, dtype: float64


Summary Statistics for person emp exp:
 count    44985.000000
mean         5.385351
std          5.886303
min          0.000000
25%          1.000000
50%          4.000000
75%          8.000000
max         50.000000
Name: person_emp_exp, dtype: float64


Summary Statistics for loan amnt:
 count    44985.000000
mean      9583.638368
std       6315.056351
min        500.000000
25%       5000.000000
50%       8000.000000
75%      12238.000000
max      35000.000000
Name: loan_amnt, dtype: float64


Summary Statistics for loan int rate:
 count    44985.000000
mean        11.006678
std          2.979087
min          5.420000
25%          8.590000
50%         11.010000
75%         12.990000
max         20.000000
Name: loan_int_rate, dtype: float64


Summary Statistics for loan percent income:
 count    44985.000000
mean         0.139743
std          0.087210
min          0.000000
25%          0.070000
50%          0.120000
75%          0.190000
max          0.660000
Name: loan_percent_income, dtype: float64


Summary Statistics for cb person cred hist length:
 count    44985.000000
mean         5.863177
std          3.869127
min          2.000000
25%          3.000000
50%          4.000000
75%          8.000000
max         30.000000
Name: cb_person_cred_hist_length, dtype: float64


Summary Statistics for credit score:
 count    44985.000000
mean       632.569123
std         50.388810
min        390.000000
25%        601.000000
50%        640.000000
75%        670.000000
max        784.000000
Name: credit_score, dtype: float64

from sklearn.preprocessing import RobustScaler
from scipy.stats.mstats import winsorize

for col in ["person_age", "person_income", "person_emp_exp", "loan_amnt"]:
    loans_df[col] = winsorize(loans_df[col], limits=[0.025, 0.025])
# Robust scaling
scaler = RobustScaler()
loans_df[["person_age", "person_income", "person_emp_exp", "loan_amnt"]] = scaler.fit_transform(loans_df[["person_age", "person_income", "person_emp_exp", "loan_amnt"]])

# Box and Whisker plot to see what the outliers in the dataset look like
# Function to perform univariate analysis for numeric columns

for column in columns_to_analyse:
    univariate_analysis(loans_df, column, column.replace('_', ' '))

/var/folders/h5/p6vdg3ps6wn1kgd_w454mf500000gn/T/ipykernel_45976/4078407632.py:5: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/h5/p6vdg3ps6wn1kgd_w454mf500000gn/T/ipykernel_45976/4078407632.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary Statistics for person age:
 count    44985.000000
mean         0.265374
std          0.886138
min         -0.833333
25%         -0.333333
50%          0.000000
75%          0.666667
max          2.833333
Name: person_age, dtype: float64


Summary Statistics for person income:
 count    44985.000000
mean         0.207182
std          0.854803
min         -0.910846
25%         -0.408603
50%          0.000000
75%          0.591397
max          2.888352
Name: person_income, dtype: float64


Summary Statistics for person emp exp:
 count    44985.000000
mean         0.177688
std          0.763888
min         -0.571429
25%         -0.428571
50%          0.000000
75%          0.571429
max          2.428571
Name: person_emp_exp, dtype: float64


Summary Statistics for loan amnt:
 count    44985.000000
mean         0.207831
std          0.833624
min         -0.898038
25%         -0.414479
50%          0.000000
75%          0.585521
max          2.348715
Name: loan_amnt, dtype: float64


Summary Statistics for loan int rate:
 count    44985.000000
mean        11.006678
std          2.979087
min          5.420000
25%          8.590000
50%         11.010000
75%         12.990000
max         20.000000
Name: loan_int_rate, dtype: float64


Summary Statistics for loan percent income:
 count    44985.000000
mean         0.139743
std          0.087210
min          0.000000
25%          0.070000
50%          0.120000
75%          0.190000
max          0.660000
Name: loan_percent_income, dtype: float64


Summary Statistics for cb person cred hist length:
 count    44985.000000
mean         5.863177
std          3.869127
min          2.000000
25%          3.000000
50%          4.000000
75%          8.000000
max         30.000000
Name: cb_person_cred_hist_length, dtype: float64


Summary Statistics for credit score:
 count    44985.000000
mean       632.569123
std         50.388810
min        390.000000
25%        601.000000
50%        640.000000
75%        670.000000
max        784.000000
Name: credit_score, dtype: float64

columns_to_check = ["person_age", "person_income", "person_emp_exp", "loan_amnt"]

for col in columns_to_check:
    skew_val = loans_df[col].skew()
    print(f"{col} skewness: {skew_val:.2f}")

person_age skewness: 1.18
person_income skewness: 1.27
person_emp_exp skewness: 1.23
loan_amnt skewness: 0.94

# Apply log1p directly — it's safe for 0s
for col in columns_to_check:
    loans_df[col] = np.log1p(loans_df[col])

# Recheck skewness
for col in columns_to_check:
    skew_val = loans_df[col].skew()
    print(f"{col} skewness after log1p: {skew_val:.2f}")

for column in columns_to_analyse:
    univariate_analysis(loans_df, column, column.replace('_', ' '))

person_age skewness after log1p: -0.22
person_income skewness after log1p: -0.72
person_emp_exp skewness after log1p: 0.22
loan_amnt skewness after log1p: -0.67

/var/folders/h5/p6vdg3ps6wn1kgd_w454mf500000gn/T/ipykernel_45976/4222552184.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summary Statistics for person age:
 count    44985.000000
mean        -0.010294
std          0.726477
min         -1.791759
25%         -0.405465
50%          0.000000
75%          0.510826
max          1.343735
Name: person_age, dtype: float64


Summary Statistics for person income:
 count    44985.000000
mean        -0.084973
std          0.806123
min         -2.417388
25%         -0.525267
50%          0.000000
75%          0.464613
max          1.357985
Name: person_income, dtype: float64


Summary Statistics for person emp exp:
 count    44985.000000
mean        -0.028691
std          0.616124
min         -0.847298
25%         -0.559616
50%          0.000000
75%          0.451985
max          1.232144
Name: person_emp_exp, dtype: float64


Summary Statistics for loan amnt:
 count    44985.000000
mean        -0.089653
std          0.816802
min         -2.283156
25%         -0.535253
50%          0.000000
75%          0.460913
max          1.208577
Name: loan_amnt, dtype: float64


Summary Statistics for loan int rate:
 count    44985.000000
mean        11.006678
std          2.979087
min          5.420000
25%          8.590000
50%         11.010000
75%         12.990000
max         20.000000
Name: loan_int_rate, dtype: float64


Summary Statistics for loan percent income:
 count    44985.000000
mean         0.139743
std          0.087210
min          0.000000
25%          0.070000
50%          0.120000
75%          0.190000
max          0.660000
Name: loan_percent_income, dtype: float64


Summary Statistics for cb person cred hist length:
 count    44985.000000
mean         5.863177
std          3.869127
min          2.000000
25%          3.000000
50%          4.000000
75%          8.000000
max         30.000000
Name: cb_person_cred_hist_length, dtype: float64


Summary Statistics for credit score:
 count    44985.000000
mean       632.569123
std         50.388810
min        390.000000
25%        601.000000
50%        640.000000
75%        670.000000
max        784.000000
Name: credit_score, dtype: float64

loans_df
loans_df.describe().T

	count	mean	std	min	25%	50%	75%	max
person_age	44985.0	-0.010294	0.726477	-1.791759	-0.405465	0.00	0.510826	1.343735
person_income	44985.0	-0.084973	0.806123	-2.417388	-0.525267	0.00	0.464613	1.357985
person_emp_exp	44985.0	-0.028691	0.616124	-0.847298	-0.559616	0.00	0.451985	1.232144
loan_amnt	44985.0	-0.089653	0.816802	-2.283156	-0.535253	0.00	0.460913	1.208577
loan_int_rate	44985.0	11.006678	2.979087	5.420000	8.590000	11.01	12.990000	20.000000
loan_percent_income	44985.0	0.139743	0.087210	0.000000	0.070000	0.12	0.190000	0.660000
cb_person_cred_hist_length	44985.0	5.863177	3.869127	2.000000	3.000000	4.00	8.000000	30.000000
credit_score	44985.0	632.569123	50.388810	390.000000	601.000000	640.00	670.000000	784.000000
loan_status	44985.0	0.222296	0.415794	0.000000	0.000000	0.00	0.000000	1.000000

# Sulani Ishara Accessed: 14th April 2025
numerical_columns = ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score']

fig, axes = plt.subplots(4, 2, figsize=(16, 20))
fig.suptitle('Numerical Features vs Loan Status (Density Plots)', fontsize=16)

for i, col in enumerate(numerical_columns):
    sns.kdeplot(data=loans_df, x=col, hue='loan_status', ax=axes[i//2, i%2], fill=True, common_norm=False, palette='muted')
    axes[i//2, i%2].set_title(f'{col} vs Loan Status')
    axes[i//2, i%2].set_xlabel(col)
    axes[i//2, i%2].set_ylabel('Density')

fig.delaxes(axes[3, 1])

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# Making Education into a non-categorical columns
loans_df['person_education'] = loans_df['person_education'].replace({
    'High School': 0,
    'Associate': 1,
    'Bachelor': 2,
    'Master': 3,
    'Doctorate': 4
})

/var/folders/h5/p6vdg3ps6wn1kgd_w454mf500000gn/T/ipykernel_45976/1636345205.py:2: FutureWarning:

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

/var/folders/h5/p6vdg3ps6wn1kgd_w454mf500000gn/T/ipykernel_45976/1636345205.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

loans_df

	person_age	person_gender	person_education	person_income	person_emp_exp	person_home_ownership	loan_amnt	loan_intent	loan_int_rate	loan_percent_income	cb_person_cred_hist_length	credit_score	previous_loan_defaults_on_file	loan_status
0	-1.098612	female	3	0.096114	-0.847298	RENT	1.208577	PERSONAL	16.02	0.49	3.0	561	No	1
1	-1.791759	female	0	-2.417388	-0.847298	OWN	-2.283156	EDUCATION	11.14	0.08	2.0	504	Yes	0
2	-0.182322	female	0	-2.417388	-0.154151	MORTGAGE	-0.423730	MEDICAL	12.87	0.44	3.0	635	No	1
3	-0.693147	female	2	0.232313	-0.847298	RENT	1.208577	MEDICAL	15.23	0.44	2.0	675	No	1
4	-0.405465	male	3	-0.018927	-0.559616	RENT	1.208577	MEDICAL	14.27	0.53	4.0	586	No	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
44995	0.154151	male	1	-0.498519	0.251314	RENT	0.676570	MEDICAL	15.66	0.31	3.0	645	No	1
44996	1.041454	female	1	-0.025978	1.049822	RENT	0.129413	HOMEIMPROVEMENT	14.07	0.14	11.0	621	No	1
44997	0.773190	male	1	-0.233123	0.356675	RENT	-1.281708	DEBTCONSOLIDATION	10.02	0.05	10.0	668	No	1
44998	0.405465	male	2	-1.195026	0.000000	RENT	0.439956	EDUCATION	13.23	0.36	6.0	604	No	1
44999	-0.405465	male	0	-0.382285	-0.559616	RENT	-0.203884	DEBTCONSOLIDATION	17.05	0.13	3.0	628	No	1

44985 rows × 14 columns

# One-hot coding for dummy variables
loans_df = pd.get_dummies(loans_df, columns = ['person_gender', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file'], drop_first = True)

# Checking the data types
loans_df.dtypes

person_age                            float64
person_education                        int64
person_income                         float64
person_emp_exp                        float64
loan_amnt                             float64
loan_int_rate                         float64
loan_percent_income                   float64
cb_person_cred_hist_length            float64
credit_score                            int64
loan_status                             int64
person_gender_male                       bool
person_home_ownership_OTHER              bool
person_home_ownership_OWN                bool
person_home_ownership_RENT               bool
loan_intent_EDUCATION                    bool
loan_intent_HOMEIMPROVEMENT              bool
loan_intent_MEDICAL                      bool
loan_intent_PERSONAL                     bool
loan_intent_VENTURE                      bool
previous_loan_defaults_on_file_Yes       bool
dtype: object

# Define numerical columns with target
numerical_columns_with_target = [
    'person_age', 
    'person_income', 
    'person_emp_exp', 
    'loan_amnt', 
    'loan_int_rate', 
    'loan_percent_income', 
    'cb_person_cred_hist_length', 
    'credit_score'
]

# Create pairplot of numerical features with loan_status as hue
sns.pairplot(loans_df[numerical_columns_with_target + ['loan_status']], 
             hue='loan_status', 
             palette='muted'
            )
plt.show()

# Getting a correlation matrix
num_loans_df = loans_df.select_dtypes(include=['number']) # Include only numerical data types

# Correlation of that data
corr_matrix = num_loans_df.corr()
print(corr_matrix)

                            person_age  person_education  person_income  \
person_age                    1.000000          0.035695       0.155206   
person_education              0.035695          1.000000       0.010170   
person_income                 0.155206          0.010170       1.000000   
person_emp_exp                0.872665          0.025813       0.115596   
loan_amnt                     0.067378          0.001772       0.412755   
loan_int_rate                 0.014931          0.003674      -0.027007   
loan_percent_income          -0.054326         -0.004378      -0.348179   
cb_person_cred_hist_length    0.769746         -0.004589       0.082971   
credit_score                  0.157042          0.211911       0.024444   
loan_status                  -0.033822         -0.001160      -0.305005   

                            person_emp_exp  loan_amnt  loan_int_rate  \
person_age                        0.872665   0.067378       0.014931   
person_education                  0.025813   0.001772       0.003674   
person_income                     0.115596   0.412755      -0.027007   
person_emp_exp                    1.000000   0.048703       0.019044   
loan_amnt                         0.048703   1.000000       0.095544   
loan_int_rate                     0.019044   0.095544       1.000000   
loan_percent_income              -0.045281   0.611334       0.125301   
cb_person_cred_hist_length        0.763690   0.035027       0.018332   
credit_score                      0.172743   0.007027       0.011539   
loan_status                      -0.026595   0.075708       0.332032   

                            loan_percent_income  cb_person_cred_hist_length  \
person_age                            -0.054326                    0.769746   
person_education                      -0.004378                   -0.004589   
person_income                         -0.348179                    0.082971   
person_emp_exp                        -0.045281                    0.763690   
loan_amnt                              0.611334                    0.035027   
loan_int_rate                          0.125301                    0.018332   
loan_percent_income                    1.000000                   -0.031191   
cb_person_cred_hist_length            -0.031191                    1.000000   
credit_score                          -0.010976                    0.153466   
loan_status                            0.384864                   -0.014299   

                            credit_score  loan_status  
person_age                      0.157042    -0.033822  
person_education                0.211911    -0.001160  
person_income                   0.024444    -0.305005  
person_emp_exp                  0.172743    -0.026595  
loan_amnt                       0.007027     0.075708  
loan_int_rate                   0.011539     0.332032  
loan_percent_income            -0.010976     0.384864  
cb_person_cred_hist_length      0.153466    -0.014299  
credit_score                    1.000000    -0.007235  
loan_status                    -0.007235     1.000000

# Visual the Correlation Matrix
plt.figure(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Variables')
plt.show()

# Drop Person Employment Experience and Age
loans_df = loans_df.drop(columns=['person_emp_exp','person_age'])
loans_df

	person_education	person_income	loan_amnt	loan_int_rate	loan_percent_income	cb_person_cred_hist_length	credit_score	loan_status	person_gender_male	person_home_ownership_OTHER	person_home_ownership_OWN	person_home_ownership_RENT	loan_intent_EDUCATION	loan_intent_HOMEIMPROVEMENT	loan_intent_MEDICAL	loan_intent_PERSONAL	loan_intent_VENTURE	previous_loan_defaults_on_file_Yes
0	3	0.096114	1.208577	16.02	0.49	3.0	561	1	False	False	False	True	False	False	False	True	False	False
1	0	-2.417388	-2.283156	11.14	0.08	2.0	504	0	False	False	True	False	True	False	False	False	False	True
2	0	-2.417388	-0.423730	12.87	0.44	3.0	635	1	False	False	False	False	False	False	True	False	False	False
3	2	0.232313	1.208577	15.23	0.44	2.0	675	1	False	False	False	True	False	False	True	False	False	False
4	3	-0.018927	1.208577	14.27	0.53	4.0	586	1	True	False	False	True	False	False	True	False	False	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
44995	1	-0.498519	0.676570	15.66	0.31	3.0	645	1	True	False	False	True	False	False	True	False	False	False
44996	1	-0.025978	0.129413	14.07	0.14	11.0	621	1	False	False	False	True	False	True	False	False	False	False
44997	1	-0.233123	-1.281708	10.02	0.05	10.0	668	1	True	False	False	True	False	False	False	False	False	False
44998	2	-1.195026	0.439956	13.23	0.36	6.0	604	1	True	False	False	True	True	False	False	False	False	False
44999	0	-0.382285	-0.203884	17.05	0.13	3.0	628	1	True	False	False	True	False	False	False	False	False	False

44985 rows × 18 columns

# Create a new column for custom labels
loans_df['loan_status_label'] = loans_df['loan_status'].map({0: 'Denied (0)', 1: 'Approved (1)'})

# Create a histogram plotting Approved and Denied loans
sns.histplot(
    data=loans_df,
    x='loan_status_label',
    hue='loan_status_label',
    palette={"Denied (0)": "red", "Approved (1)": "green"}
)
plt.title("Amount of Denied and Approved Loans")
plt.xlabel("Loan Status")
plt.ylabel("Count")
plt.show

# Splitting the Dataset into X and Y
X = loans_df.drop(columns=['loan_status', 'loan_status_label'])  
y = loans_df['loan_status']

	person_education	person_income	loan_amnt	loan_int_rate	loan_percent_income	cb_person_cred_hist_length	credit_score	person_gender_male	person_home_ownership_OTHER	person_home_ownership_OWN	person_home_ownership_RENT	loan_intent_EDUCATION	loan_intent_HOMEIMPROVEMENT	loan_intent_MEDICAL	loan_intent_PERSONAL	loan_intent_VENTURE	previous_loan_defaults_on_file_Yes
0	3	0.096114	1.208577	16.02	0.49	3.0	561	False	False	False	True	False	False	False	True	False	False
1	0	-2.417388	-2.283156	11.14	0.08	2.0	504	False	False	True	False	True	False	False	False	False	True
2	0	-2.417388	-0.423730	12.87	0.44	3.0	635	False	False	False	False	False	False	True	False	False	False
3	2	0.232313	1.208577	15.23	0.44	2.0	675	False	False	False	True	False	False	True	False	False	False
4	3	-0.018927	1.208577	14.27	0.53	4.0	586	True	False	False	True	False	False	True	False	False	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
44995	1	-0.498519	0.676570	15.66	0.31	3.0	645	True	False	False	True	False	False	True	False	False	False
44996	1	-0.025978	0.129413	14.07	0.14	11.0	621	False	False	False	True	False	True	False	False	False	False
44997	1	-0.233123	-1.281708	10.02	0.05	10.0	668	True	False	False	True	False	False	False	False	False	False
44998	2	-1.195026	0.439956	13.23	0.36	6.0	604	True	False	False	True	True	False	False	False	False	False
44999	0	-0.382285	-0.203884	17.05	0.13	3.0	628	True	False	False	True	False	False	False	False	False	False

44985 rows × 17 columns

0        1
1        0
2        1
3        1
4        1
        ..
44995    1
44996    1
44997    1
44998    1
44999    1
Name: loan_status, Length: 44985, dtype: int64

# Splitting the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Setting Up 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lr_accuracy_scores = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- Model Training ---
    reg_model_lr = LogisticRegression(max_iter=200000, random_state=42)
    reg_model_lr.fit(X_resampled, y_resampled)
    
    # Evaluate the model on the test data
    lr_accuracy = reg_model_lr.score(X_test, y_test)
    lr_accuracy_scores.append(lr_accuracy)
    print(f"Fold {fold} Accuracy: {lr_accuracy:.4f}")
    
print(f"Average Accuracy: {sum(lr_accuracy_scores)/len(lr_accuracy_scores):.4f}")

Fold 1 Accuracy: 0.9000
Fold 2 Accuracy: 0.9046
Fold 3 Accuracy: 0.8998
Fold 4 Accuracy: 0.8991
Fold 5 Accuracy: 0.8993
Fold 6 Accuracy: 0.8993
Fold 7 Accuracy: 0.8993
Fold 8 Accuracy: 0.8962
Fold 9 Accuracy: 0.8980
Fold 10 Accuracy: 0.8986
Average Accuracy: 0.8994

# Setting Up 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lr2_accuracy_scores = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- Model Training ---
    reg_model_lr2 = LogisticRegression(max_iter=200000, random_state=42, penalty='l2')
    reg_model_lr2.fit(X_resampled, y_resampled)
    
    # Evaluate the model on the test data
    lr2_accuracy = reg_model_lr2.score(X_test, y_test)
    lr2_accuracy_scores.append(lr2_accuracy)
    print(f"Fold {fold} Accuracy: {lr2_accuracy:.4f}")
    
print(f"Average Accuracy: {sum(lr2_accuracy_scores)/len(lr2_accuracy_scores):.4f}")

Fold 1 Accuracy: 0.9000
Fold 2 Accuracy: 0.9046
Fold 3 Accuracy: 0.8998
Fold 4 Accuracy: 0.8991
Fold 5 Accuracy: 0.8993
Fold 6 Accuracy: 0.8993
Fold 7 Accuracy: 0.8993
Fold 8 Accuracy: 0.8962
Fold 9 Accuracy: 0.8980
Fold 10 Accuracy: 0.8986
Average Accuracy: 0.8994

# Getting the predictions for the Logistic Regression Model
predictions_lr = reg_model_lr.predict(X_test)

# Getting the predictions for the Logistic Regression Model
predictions_lr2 = reg_model_lr2.predict(X_test)

# Compute the evaluation metrics
lr_precision = precision_score(y_test, predictions_lr)
lr_recall = recall_score (y_test, predictions_lr)
lr_f1 = f1_score(y_test, predictions_lr)

# Print out evaluation metrics
print(f"Average Accuracy: {sum(lr_accuracy_scores)/len(lr_accuracy_scores):.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall: {lr_recall:.4f}")
print(f"F1-Score: {lr_f1:.4f}")

Average Accuracy: 0.8994
Precision: 0.7804
Recall: 0.7570
F1-Score: 0.7685

# Compute the evaluation metrics
lr2_precision = precision_score(y_test, predictions_lr2)
lr2_recall = recall_score (y_test, predictions_lr2)
lr2_f1 = f1_score(y_test, predictions_lr2)

# Print out evaluation metrics
print(f"Average Accuracy: {sum(lr2_accuracy_scores)/len(lr2_accuracy_scores):.4f}")
print(f"Precision: {lr2_precision:.4f}")
print(f"Recall: {lr2_recall:.4f}")
print(f"F1-Score: {lr2_f1:.4f}")

Average Accuracy: 0.8994
Precision: 0.7804
Recall: 0.7570
F1-Score: 0.7685

lr_cm = confusion_matrix(y_test, predictions_lr )
print(lr_cm)

# Define new labels: index 0 -> "Denied", index 1 -> "Approved"
labels = ['Denied', 'Approved']

# Plot the confusion matrix heatmap with the renamed labels
plt.figure(figsize=(8, 6))
sns.heatmap(lr_cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=["Predicted Denied", "Predicted Approved"],
            yticklabels=["Actual Denied", "Actual Approved"])
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.title("Confusion Matrix (Logistic Regression)", fontsize=14)
plt.show()

[[3285  213]
 [ 243  757]]

# Calculating the AUC-ROC | from one of the tutorials
lr_y_prob = reg_model_lr.predict_proba(X_test)[:, 1]

lr_auc_roc = roc_auc_score(y_test, lr_y_prob)
print(f"AUC-ROC: {lr_auc_roc:.4f}")

AUC-ROC: 0.9552

# Calculating the AUC-ROC | from one of the tutorials
lr2_y_prob = reg_model_lr2.predict_proba(X_test)[:, 1]

lr2_auc_roc = roc_auc_score(y_test, lr2_y_prob)
print(f"AUC-ROC: {lr2_auc_roc:.4f}")

AUC-ROC: 0.9552

# From ChatGPT

# Get false positive rate, true positive rate and thresholds
fpr, tpr, thresholds = roc_curve(y_test, lr_y_prob)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {lr_auc_roc:.4f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Diagonal line for random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (Logistic Regression)')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Setting Up 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
dt_accuracy_scores = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- Model Training ---
    dt_model = DecisionTreeClassifier(random_state=42)
    dt_model.fit(X_resampled, y_resampled)
    
    # Evaluate the model on the test data
    dt_accuracy = dt_model.score(X_test, y_test)
    dt_accuracy_scores.append(dt_accuracy)
    print(f"Fold {fold} Accuracy: {dt_accuracy:.4f}")
    
print(f"Average Accuracy: {sum(dt_accuracy_scores)/len(dt_accuracy_scores):.4f}")

Fold 1 Accuracy: 0.9029
Fold 2 Accuracy: 0.9111
Fold 3 Accuracy: 0.9089
Fold 4 Accuracy: 0.8929
Fold 5 Accuracy: 0.9004
Fold 6 Accuracy: 0.8991
Fold 7 Accuracy: 0.9024
Fold 8 Accuracy: 0.9020
Fold 9 Accuracy: 0.8984
Fold 10 Accuracy: 0.8986
Average Accuracy: 0.9017

# Getting the predictions for the Decision Tree Model
predictions_dt = dt_model.predict(X_test)

# Compute the evaluation metrics
dt_precision = precision_score(y_test, predictions_dt)
dt_recall = recall_score (y_test, predictions_dt)
dt_f1 = f1_score(y_test, predictions_dt)

# Print out evaluation metrics
print(f"Average Accuracy: {sum(dt_accuracy_scores)/len(dt_accuracy_scores):.4f}")
print(f"Precision: {dt_precision:.4f}")
print(f"Recall: {dt_recall:.4f}")
print(f"F1-Score: {dt_f1:.4f}")

Average Accuracy: 0.9017
Precision: 0.7677
Recall: 0.7800
F1-Score: 0.7738

dt_cm = confusion_matrix(y_test, predictions_dt )
print(dt_cm)

# Define new labels: index 0 -> "Denied", index 1 -> "Approved"
labels = ['Denied', 'Approved']

# Plot the confusion matrix heatmap with the renamed labels
plt.figure(figsize=(8, 6))
sns.heatmap(dt_cm, annot=True, fmt="d", cmap="Blues",  cbar=False,
            xticklabels=["Predicted Denied", "Predicted Approved"],
            yticklabels=["Actual Denied", "Actual Approved"])
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.title("Confusion Matrix (Decision Tree)", fontsize=14)
plt.show()

[[3262  236]
 [ 220  780]]

# Calculating the AUC-ROC | from one of the tutorials
dt_y_prob = dt_model.predict_proba(X_test)[:, 1]

dt_auc_roc = roc_auc_score(y_test, dt_y_prob)
print(f"AUC-ROC: {dt_auc_roc:.4f}")

AUC-ROC: 0.8563

# From ChatGPT

# Get false positive rate, true positive rate and thresholds
fpr, tpr, thresholds = roc_curve(y_test, dt_y_prob)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {dt_auc_roc:.4f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Diagonal line for random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (Decsision Tree)')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Setting Up 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
rf_accuracy_scores = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- Model Training ---
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_resampled, y_resampled)
    
    # Evaluate the model on the test data
    rf_accuracy = rf_model.score(X_test, y_test)
    rf_accuracy_scores.append(rf_accuracy)
    print(f"Fold {fold} Accuracy: {rf_accuracy:.4f}")
    
print(f"Average Accuracy: {sum(rf_accuracy_scores)/len(rf_accuracy_scores):.4f}")

Fold 1 Accuracy: 0.9249
Fold 2 Accuracy: 0.9349
Fold 3 Accuracy: 0.9307
Fold 4 Accuracy: 0.9275
Fold 5 Accuracy: 0.9280
Fold 6 Accuracy: 0.9340
Fold 7 Accuracy: 0.9311
Fold 8 Accuracy: 0.9293
Fold 9 Accuracy: 0.9273
Fold 10 Accuracy: 0.9289
Average Accuracy: 0.9296

# Setting Up 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
rf2_accuracy_scores = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- Model Training ---
    rf2_model = RandomForestClassifier(n_estimators=200, 
                                       random_state=42, 
                                       max_depth=8,
                                       min_samples_split=5,
                                       min_samples_leaf=2,
                                       max_features='sqrt',
                                       bootstrap=True)
    rf2_model.fit(X_resampled, y_resampled)
    
    # Evaluate the model on the test data
    rf2_accuracy = rf2_model.score(X_test, y_test)
    rf2_accuracy_scores.append(rf2_accuracy)
    print(f"Fold {fold} Accuracy: {rf2_accuracy:.4f}")
    
print(f"Average Accuracy: {sum(rf2_accuracy_scores)/len(rf2_accuracy_scores):.4f}")

Fold 1 Accuracy: 0.9224
Fold 2 Accuracy: 0.9262
Fold 3 Accuracy: 0.9193
Fold 4 Accuracy: 0.9193
Fold 5 Accuracy: 0.9206
Fold 6 Accuracy: 0.9231
Fold 7 Accuracy: 0.9173
Fold 8 Accuracy: 0.9202
Fold 9 Accuracy: 0.9215
Fold 10 Accuracy: 0.9240
Average Accuracy: 0.9214

# Setting Up 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
rf3_accuracy_scores = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- Model Training ---
    rf3_model = RandomForestClassifier(n_estimators=200, 
                                       random_state=42, 
                                       max_depth=8,
                                       min_samples_split=5,
                                       min_samples_leaf=2,
                                       max_features='sqrt',
                                       bootstrap=False)
    rf3_model.fit(X_resampled, y_resampled)
    
    # Evaluate the model on the test data
    rf3_accuracy = rf3_model.score(X_test, y_test)
    rf3_accuracy_scores.append(rf3_accuracy)
    print(f"Fold {fold} Accuracy: {rf3_accuracy:.4f}")
    
print(f"Average Accuracy: {sum(rf3_accuracy_scores)/len(rf3_accuracy_scores):.4f}")

Fold 1 Accuracy: 0.9204
Fold 2 Accuracy: 0.9255
Fold 3 Accuracy: 0.9193
Fold 4 Accuracy: 0.9182
Fold 5 Accuracy: 0.9213
Fold 6 Accuracy: 0.9235
Fold 7 Accuracy: 0.9175
Fold 8 Accuracy: 0.9222
Fold 9 Accuracy: 0.9220
Fold 10 Accuracy: 0.9240
Average Accuracy: 0.9214

# Getting the predictions for the Logistic Regression Model
predictions_rf = rf_model.predict(X_test)

# Getting the predictions for the Logistic Regression Model
predictions_rf2 = rf2_model.predict(X_test)

# Getting the predictions for the Logistic Regression Model
predictions_rf3 = rf3_model.predict(X_test)

# Compute the evaluation metrics
rf_precision = precision_score(y_test, predictions_rf)
rf_recall = recall_score (y_test, predictions_rf)
rf_f1 = f1_score(y_test, predictions_rf)

# Print out evaluation metrics
print(f"Average Accuracy: {sum(rf_accuracy_scores)/len(rf_accuracy_scores):.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1-Score: {rf_f1:.4f}")

Average Accuracy: 0.9296
Precision: 0.8972
Recall: 0.7680
F1-Score: 0.8276

# Compute the evaluation metrics
rf2_precision = precision_score(y_test, predictions_rf2)
rf2_recall = recall_score (y_test, predictions_rf2)
rf2_f1 = f1_score(y_test, predictions_rf2)

# Print out evaluation metrics
print(f"Average Accuracy: {sum(rf2_accuracy_scores)/len(rf2_accuracy_scores):.4f}")
print(f"Precision: {rf2_precision:.4f}")
print(f"Recall: {rf2_recall:.4f}")
print(f"F1-Score: {rf2_f1:.4f}")

Average Accuracy: 0.9214
Precision: 0.9175
Recall: 0.7230
F1-Score: 0.8087

# Compute the evaluation metrics
rf3_precision = precision_score(y_test, predictions_rf3)
rf3_recall = recall_score (y_test, predictions_rf3)
rf3_f1 = f1_score(y_test, predictions_rf3)

# Print out evaluation metrics
print(f"Average Accuracy: {sum(rf3_accuracy_scores)/len(rf3_accuracy_scores):.4f}")
print(f"Precision: {rf3_precision:.4f}")
print(f"Recall: {rf3_recall:.4f}")
print(f"F1-Score: {rf3_f1:.4f}")

Average Accuracy: 0.9214
Precision: 0.9218
Recall: 0.7190
F1-Score: 0.8079

rf_cm = confusion_matrix(y_test, predictions_rf)
print(rf_cm)

# Define new labels: index 0 -> "Denied", index 1 -> "Approved"
labels = ['Denied', 'Approved']

# Plot the confusion matrix heatmap with the renamed labels
plt.figure(figsize=(8, 6))
sns.heatmap(rf_cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=["Predicted Denied", "Predicted Approved"],
            yticklabels=["Actual Denied", "Actual Approved"])
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.title("Confusion Matrix (Random Forest (Untuned))", fontsize=14)
plt.show()

[[3410   88]
 [ 232  768]]

rf2_cm = confusion_matrix(y_test, predictions_rf2)
print(rf2_cm)

# Define new labels: index 0 -> "Denied", index 1 -> "Approved"
labels = ['Denied', 'Approved']

# Plot the confusion matrix heatmap with the renamed labels
plt.figure(figsize=(8, 6))
sns.heatmap(rf2_cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=["Predicted Denied", "Predicted Approved"],
            yticklabels=["Actual Denied", "Actual Approved"])
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.title("Confusion Matrix (Random Forest (Tuned v1))", fontsize=14)
plt.show()

[[3433   65]
 [ 277  723]]

rf3_cm = confusion_matrix(y_test, predictions_rf2)
print(rf3_cm)

# Define new labels: index 0 -> "Denied", index 1 -> "Approved"
labels = ['Denied', 'Approved']

# Plot the confusion matrix heatmap with the renamed labels
plt.figure(figsize=(8, 6))
sns.heatmap(rf3_cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=["Predicted Denied", "Predicted Approved"],
            yticklabels=["Actual Denied", "Actual Approved"])
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.title("Confusion Matrix (Random Forest (Tuned v2))", fontsize=14)
plt.show()

[[3433   65]
 [ 277  723]]

# Calculating the AUC-ROC | from one of the tutorials
rf_y_prob = rf_model.predict_proba(X_test)[:, 1]

rf_auc_roc = roc_auc_score(y_test, rf_y_prob)
print(f"AUC-ROC: {rf_auc_roc:.4f}")

AUC-ROC: 0.9747

# Calculating the AUC-ROC | from one of the tutorials
rf2_y_prob = rf2_model.predict_proba(X_test)[:, 1]

rf2_auc_roc = roc_auc_score(y_test, rf2_y_prob)
print(f"AUC-ROC: {rf2_auc_roc:.4f}")

AUC-ROC: 0.9683

# Calculating the AUC-ROC | from one of the tutorials
rf3_y_prob = rf3_model.predict_proba(X_test)[:, 1]

rf3_auc_roc = roc_auc_score(y_test, rf3_y_prob)
print(f"AUC-ROC: {rf3_auc_roc:.4f}")

AUC-ROC: 0.9684

# From ChatGPT

# Get false positive rate, true positive rate and thresholds
fpr, tpr, thresholds = roc_curve(y_test, rf_y_prob)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {rf_auc_roc:.4f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Diagonal line for random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (Random Forest (Untuned))')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Dictionary of model names and predicted probabilities
models_probs = {
    "Random Forest(Untuned)": rf_y_prob,
    "Random Forest(Tuned v1)": rf2_y_prob,
    "Random Forest(Tuned v2)": rf3_y_prob
}

plt.figure(figsize=(10, 8))

# Plot each ROC curve
for name, probs in models_probs.items():
    fpr, tpr, _ = roc_curve(y_test, probs)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.5f})')

# Plot random guess line
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison of Models (Random Forest)')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Setting Up 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_accuracy_scores = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- Model Training ---
    xgb_model = XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        eval_metric='logloss',
        random_state=42
    )
    xgb_model.fit(X_train, y_train)
    
    # Evaluate the model on the test data
    xgb_accuracy = xgb_model.score(X_test, y_test)
    xgb_accuracy_scores.append(xgb_accuracy)
    print(f"Fold {fold} Accuracy: {xgb_accuracy:.4f}")
    
print(f"Average Accuracy: {sum(xgb_accuracy_scores)/len(xgb_accuracy_scores):.4f}")

Fold 1 Accuracy: 0.9355
Fold 2 Accuracy: 0.9413
Fold 3 Accuracy: 0.9382
Fold 4 Accuracy: 0.9378
Fold 5 Accuracy: 0.9353
Fold 6 Accuracy: 0.9426
Fold 7 Accuracy: 0.9360
Fold 8 Accuracy: 0.9360
Fold 9 Accuracy: 0.9373
Fold 10 Accuracy: 0.9353
Average Accuracy: 0.9375

# Setting Up 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb2_accuracy_scores = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- Model Training ---
    xgb2_model = XGBClassifier(
        n_estimators=100,
        max_depth=8,
        learning_rate=0.1,
        eval_metric='logloss',
        random_state=42
    )
    xgb2_model.fit(X_train, y_train)
    
    # Evaluate the model on the test data
    xgb2_accuracy = xgb2_model.score(X_test, y_test)
    xgb2_accuracy_scores.append(xgb2_accuracy)
    print(f"Fold {fold} Accuracy: {xgb2_accuracy:.4f}")
    
print(f"Average Accuracy: {sum(xgb2_accuracy_scores)/len(xgb2_accuracy_scores):.4f}")

Fold 1 Accuracy: 0.9444
Fold 2 Accuracy: 0.9529
Fold 3 Accuracy: 0.9498
Fold 4 Accuracy: 0.9482
Fold 5 Accuracy: 0.9495
Fold 6 Accuracy: 0.9531
Fold 7 Accuracy: 0.9480
Fold 8 Accuracy: 0.9453
Fold 9 Accuracy: 0.9471
Fold 10 Accuracy: 0.9478
Average Accuracy: 0.9486

# Getting the predictions for the Logistic Regression Model
predictions_xgb = xgb_model.predict(X_test)

# Getting the predictions for the Logistic Regression Model
predictions_xgb2 = xgb2_model.predict(X_test)

# Compute the evaluation metrics
xgb_precision = precision_score(y_test, predictions_xgb)
xgb_recall = recall_score (y_test, predictions_xgb)
xgb_f1 = f1_score(y_test, predictions_xgb)

# Print out evaluation metrics
print(f"Average Accuracy: {sum(xgb_accuracy_scores)/len(xgb_accuracy_scores):.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1-Score: {xgb_f1:.4f}")

Average Accuracy: 0.9375
Precision: 0.9098
Recall: 0.7870
F1-Score: 0.8440

# Compute the evaluation metrics
xgb2_precision = precision_score(y_test, predictions_xgb2)
xgb2_recall = recall_score (y_test, predictions_xgb2)
xgb2_f1 = f1_score(y_test, predictions_xgb2)

# Print out evaluation metrics
print(f"Average Accuracy: {sum(xgb2_accuracy_scores)/len(xgb2_accuracy_scores):.4f}")
print(f"Precision: {xgb2_precision:.4f}")
print(f"Recall: {xgb2_recall:.4f}")
print(f"F1-Score: {xgb2_f1:.4f}")

Average Accuracy: 0.9486
Precision: 0.9332
Recall: 0.8240
F1-Score: 0.8752

xgb_cm = confusion_matrix(y_test, predictions_xgb)
print(xgb_cm)

# Define new labels: index 0 -> "Denied", index 1 -> "Approved"
labels = ['Denied', 'Approved']

# Plot the confusion matrix heatmap with the renamed labels
plt.figure(figsize=(8, 6))
sns.heatmap(xgb_cm, annot=True, fmt="d", cmap="Blues",  cbar=False,
            xticklabels=["Predicted Denied", "Predicted Approved"],
            yticklabels=["Actual Denied", "Actual Approved"])
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.title("Confusion Matrix (XGBoost (Untuned))", fontsize=14)
plt.show()

[[3420   78]
 [ 213  787]]

xgb2_cm = confusion_matrix(y_test, predictions_xgb2)
print(xgb2_cm)

# Define new labels: index 0 -> "Denied", index 1 -> "Approved"
labels = ['Denied', 'Approved']

# Plot the confusion matrix heatmap with the renamed labels
plt.figure(figsize=(8, 6))
sns.heatmap(xgb2_cm, annot=True, fmt="d", cmap="Blues",  cbar=False,
            xticklabels=["Predicted Denied", "Predicted Approved"],
            yticklabels=["Actual Denied", "Actual Approved"])
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.title("Confusion Matrix (XGBoost (Tuned))", fontsize=14)
plt.show()

[[3439   59]
 [ 176  824]]

# Calculating the AUC-ROC | from one of the tutorials
xgb_y_prob = xgb_model.predict_proba(X_test)[:, 1]

xgb_auc_roc = roc_auc_score(y_test, xgb_y_prob)
print(f"AUC-ROC: {xgb_auc_roc:.4f}")

AUC-ROC: 0.9810

# Calculating the AUC-ROC | from one of the tutorials
xgb2_y_prob = xgb2_model.predict_proba(X_test)[:, 1]

xgb2_auc_roc = roc_auc_score(y_test, xgb2_y_prob)
print(f"AUC-ROC: {xgb2_auc_roc:.4f}")

AUC-ROC: 0.9868

# From ChatGPT

# Get false positive rate, true positive rate and thresholds
fpr, tpr, thresholds = roc_curve(y_test, xgb_y_prob)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {xgb_auc_roc:.4f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Diagonal line for random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (XGBoost (Untuned))')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Dictionary of model names and predicted probabilities
models_probs = {
    "XGBoost (Tuned)": xgb2_y_prob,
    "XGBoost (Untuned)": xgb_y_prob,
}

plt.figure(figsize=(10, 8))

# Plot each ROC curve
for name, probs in models_probs.items():
    fpr, tpr, _ = roc_curve(y_test, probs)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.5f})')

# Plot random guess line
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison of Models (XGBoost)')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Setting Up 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
knn_accuracy_scores = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- Model Training ---
    knn_model = KNeighborsClassifier(
        n_neighbors=2, 
        weights='uniform', 
        algorithm='auto', 
        leaf_size=30, 
        metric='minkowski'
    )
    knn_model.fit(X_train, y_train)
    
    # Evaluate the model on the test data
    knn_accuracy = knn_model.score(X_test, y_test)
    knn_accuracy_scores.append(knn_accuracy)
    print(f"Fold {fold} Accuracy: {knn_accuracy:.4f}")
    
print(f"Average Accuracy: {sum(knn_accuracy_scores)/len(knn_accuracy_scores):.4f}")

Fold 1 Accuracy: 0.8735
Fold 2 Accuracy: 0.8629
Fold 3 Accuracy: 0.8702
Fold 4 Accuracy: 0.8675
Fold 5 Accuracy: 0.8649
Fold 6 Accuracy: 0.8722
Fold 7 Accuracy: 0.8688
Fold 8 Accuracy: 0.8684
Fold 9 Accuracy: 0.8686
Fold 10 Accuracy: 0.8610
Average Accuracy: 0.8678

# Getting the predictions for the Logistic Regression Model
predictions_knn = knn_model.predict(X_test)

# Compute the evaluation metrics
knn_precision = precision_score(y_test, predictions_knn)
knn_recall = recall_score (y_test, predictions_knn)
knn_f1 = f1_score(y_test, predictions_knn)

# Print out evaluation metrics
print(f"Average Accuracy: {sum(knn_accuracy_scores)/len(xgb_accuracy_scores):.4f}")
print(f"Precision: {knn_precision:.4f}")
print(f"Recall: {knn_recall:.4f}")
print(f"F1-Score: {knn_f1:.4f}")

Average Accuracy: 0.8678
Precision: 0.8947
Recall: 0.4250
F1-Score: 0.5763

knn_cm = confusion_matrix(y_test, predictions_knn)
print(knn_cm)

# Define new labels: index 0 -> "Denied", index 1 -> "Approved"
labels = ['Denied', 'Approved']

# Plot the confusion matrix heatmap with the renamed labels
plt.figure(figsize=(8, 6))
sns.heatmap(knn_cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=["Predicted Denied", "Predicted Approved"],
            yticklabels=["Actual Denied", "Actual Approved"])
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.title("Confusion Matrix (KNN)", fontsize=14)
plt.show()

[[3448   50]
 [ 575  425]]

# Calculating the AUC-ROC | from one of the tutorials
knn_y_prob = knn_model.predict_proba(X_test)[:, 1]

knn_auc_roc = roc_auc_score(y_test, knn_y_prob)
print(f"AUC-ROC: {knn_auc_roc:.4f}")

AUC-ROC: 0.8882

# Get false positive rate, true positive rate and thresholds
fpr, tpr, thresholds = roc_curve(y_test, knn_y_prob)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {knn_auc_roc:.4f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Diagonal line for random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (KNN)')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Dictionary of model names and predicted probabilities
models_probs = {
    "Logistic Regression": lr_y_prob,
    "Decision Tree": dt_y_prob,
    "Random Forest": rf_y_prob,
    "Random Forest(Tuned v1)": rf2_y_prob,
    "Random Forest(Tuned v2)": rf3_y_prob,
    "XGBoost (Tuned)": xgb2_y_prob,
    "XGBoost (Untuned)": xgb_y_prob,
    "KNN": knn_y_prob

}

plt.figure(figsize=(10, 8))

# Plot each ROC curve
for name, probs in models_probs.items():
    fpr, tpr, _ = roc_curve(y_test, probs)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

# Plot random guess line
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison of Models (All Models)')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Print out all evaluation metrics
print("Logistic Regression (Untuned) Model Evaluation Metrics:")
print(f"Average Accuracy: {sum(lr_accuracy_scores)/len(lr_accuracy_scores):.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall: {lr_recall:.4f}")
print(f"F1-Score: {lr_f1:.4f}")
print(f"AUC-ROC: {lr_auc_roc:.4f}")

print("  ")
print("Logistic Regression (Tuned) Model Evaluation Metrics:")
print(f"Average Accuracy: {sum(lr2_accuracy_scores)/len(lr2_accuracy_scores):.4f}")
print(f"Precision: {lr2_precision:.4f}")
print(f"Recall: {lr2_recall:.4f}")
print(f"F1-Score: {lr2_f1:.4f}")
print(f"AUC-ROC: {lr2_auc_roc:.4f}")

print("  ")
print("Decision Tree Model Evaluation Metrics:")
print(f"Average Accuracy: {sum(dt_accuracy_scores)/len(dt_accuracy_scores):.4f}")
print(f"Precision: {dt_precision:.4f}")
print(f"Recall: {dt_recall:.4f}")
print(f"F1-Score: {dt_f1:.4f}")
print(f"AUC-ROC: {dt_auc_roc:.4f}")

print("  ")
print("Random Forest (Untuned) Model Evaluation Metrics:")
print(f"Average Accuracy: {sum(rf_accuracy_scores)/len(rf_accuracy_scores):.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1-Score: {rf_f1:.4f}")
print(f"AUC-ROC: {rf_auc_roc:.4f}")

print("  ")
print("Random Forest (Tuned v1) Model Evaluation Metrics:")
print(f"Average Accuracy: {sum(rf2_accuracy_scores)/len(rf2_accuracy_scores):.4f}")
print(f"Precision: {rf2_precision:.4f}")
print(f"Recall: {rf2_recall:.4f}")
print(f"F1-Score: {rf2_f1:.4f}")
print(f"AUC-ROC: {rf2_auc_roc:.4f}")

print( "  ")
print("Random Forest (Tuned v2) Model Evaluation Metrics:")
print(f"Average Accuracy: {sum(rf3_accuracy_scores)/len(rf3_accuracy_scores):.4f}")
print(f"Precision: {rf3_precision:.4f}")
print(f"Recall: {rf3_recall:.4f}")
print(f"F1-Score: {rf3_f1:.4f}")
print(f"AUC-ROC: {rf3_auc_roc:.4f}")

print("  ")
print("XGBoost (Untuned) Model Evaluation Metrics:")
print(f"Average Accuracy: {sum(xgb_accuracy_scores)/len(xgb_accuracy_scores):.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1-Score: {xgb_f1:.4f}")
print(f"AUC-ROC: {xgb_auc_roc:.4f}")

print("  ")
print("XGBoost (Tuned) Model Evaluation Metrics:")
print(f"Average Accuracy: {sum(xgb2_accuracy_scores)/len(xgb2_accuracy_scores):.4f}")
print(f"Precision: {xgb2_precision:.4f}")
print(f"Recall: {xgb2_recall:.4f}")
print(f"F1-Score: {xgb2_f1:.4f}")
print(f"AUC-ROC: {xgb2_auc_roc:.4f}")

print("  ")
print("KNN Model Evaluation Metrics:")
print(f"Average Accuracy: {sum(knn_accuracy_scores)/len(xgb_accuracy_scores):.4f}")
print(f"Precision: {knn_precision:.4f}")
print(f"Recall: {knn_recall:.4f}")
print(f"F1-Score: {knn_f1:.4f}")
print(f"AUC-ROC: {knn_auc_roc:.4f}")

Logistic Regression (Untuned) Model Evaluation Metrics:
Average Accuracy: 0.8994
Precision: 0.7804
Recall: 0.7570
F1-Score: 0.7685
AUC-ROC: 0.9552
  
Logistic Regression (Tuned) Model Evaluation Metrics:
Average Accuracy: 0.8994
Precision: 0.7804
Recall: 0.7570
F1-Score: 0.7685
AUC-ROC: 0.9552
  
Decision Tree Model Evaluation Metrics:
Average Accuracy: 0.9017
Precision: 0.7677
Recall: 0.7800
F1-Score: 0.7738
AUC-ROC: 0.8563
  
Random Forest (Untuned) Model Evaluation Metrics:
Average Accuracy: 0.9296
Precision: 0.8972
Recall: 0.7680
F1-Score: 0.8276
AUC-ROC: 0.9747
  
Random Forest (Tuned v1) Model Evaluation Metrics:
Average Accuracy: 0.9214
Precision: 0.9175
Recall: 0.7230
F1-Score: 0.8087
AUC-ROC: 0.9683
  
Random Forest (Tuned v2) Model Evaluation Metrics:
Average Accuracy: 0.9214
Precision: 0.9218
Recall: 0.7190
F1-Score: 0.8079
AUC-ROC: 0.9684
  
XGBoost (Untuned) Model Evaluation Metrics:
Average Accuracy: 0.9375
Precision: 0.9098
Recall: 0.7870
F1-Score: 0.8440
AUC-ROC: 0.9810
  
XGBoost (Tuned) Model Evaluation Metrics:
Average Accuracy: 0.9486
Precision: 0.9332
Recall: 0.8240
F1-Score: 0.8752
AUC-ROC: 0.9868
  
KNN Model Evaluation Metrics:
Average Accuracy: 0.8678
Precision: 0.8947
Recall: 0.4250
F1-Score: 0.5763
AUC-ROC: 0.8882