Millions of people develop some sort of heart disease every year, and heart disease is the biggest killer of both men and women in the United States and around the world. Statistical analysis has identified many risk factors associated with heart disease, such as age, blood pressure, total cholesterol, diabetes, hypertension, family history of heart disease, obesity, lack of physical exercise, and more.
In this project, I will use the dataset from the Cleveland heart disease dataset (provided by DataCamp) to predict a model that can predict the likelihood of heart disease from its associated factors that are significant.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(yardstick)
##
## Attaching package: 'yardstick'
##
## The following object is masked from 'package:readr':
##
## spec
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:yardstick':
##
## accuracy, mae, mape, mase, precision, recall, rmse, smape
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:Metrics':
##
## precision, recall
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:purrr':
##
## lift
# Define the file path
file_path <- "/Users/thandarmoe/Library/Mobile Documents/com~apple~CloudDocs/teaching/Cloud/me/R-Language/Practice/Dataset/Cleveland_hd.csv"
# Load the CSV file
hd_data <- read.csv(file_path)
# Inspect the first five rows
head(hd_data, 5)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 1 1 145 233 1 2 150 0 2.3 3 0 6
## 2 67 1 4 160 286 0 2 108 1 1.5 2 3 3
## 3 67 1 4 120 229 0 2 129 1 2.6 2 2 7
## 4 37 1 3 130 250 0 0 187 0 3.5 3 0 3
## 5 41 0 2 130 204 0 2 172 0 1.4 1 0 3
## class
## 1 0
## 2 2
## 3 1
## 4 0
## 5 0
str(hd_data)
## 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : int 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : int 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps: int 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : int 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : int 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : int 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : int 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : int 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : int 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : int 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : int 6 3 7 3 3 3 3 3 7 7 ...
## $ class : int 0 2 1 0 0 0 3 0 2 1 ...
## (Clening the data)
#to check if there are missing values
sapply(hd_data, function(x) sum(is.na(x)))
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal class
## 0 0 0 4 2 0
hd_data <- na.omit(hd_data)
##(Transform the data)
hd_data$sex_n[hd_data$sex == "0"] <- "female"
hd_data$sex_n[hd_data$sex == "1"] <- "male"
hd_data$cp_n[hd_data$cp == "1"] <- "ty angina"
hd_data$cp_n[hd_data$cp == "2"] <- "atyp angina"
hd_data$cp_n[hd_data$cp == "3"] <- "non-ang pain"
hd_data$cp_n[hd_data$cp == "4"] <- "asym"
hd_data$fbs_n[hd_data$fbs == "0"] <- "no high glu"
hd_data$fbs_n[hd_data$fbs == "1"] <- "high glu"
hd_data$restecg_n[hd_data$restecg == "0"] <- "normal"
hd_data$restecg_n[hd_data$restecg == "1"] <- "ST-T abn"
hd_data$restecg_n[hd_data$restecg == "2"] <- "LVH"
hd_data$exang_n[hd_data$exang == "0"] <- "no exang"
hd_data$exang_n[hd_data$exang == "1"] <- "exang"
hd_data$slope_n[hd_data$slope == "1"] <- "up"
hd_data$slope_n[hd_data$slope == "2"] <- "flat"
hd_data$slope_n[hd_data$slope == "3"] <- "down"
hd_data$thal_n[hd_data$thal == "3"] <- "normal"
hd_data$thal_n[hd_data$thal == "6"] <- "fixed defect"
hd_data$thal_n[hd_data$thal == "7"] <- "rev defect"
hd_data$class_n <- ifelse(hd_data$class == 0, 0, 1)
# factorize
hd_data$sex_n <- factor(hd_data$sex_n, levels = c("female", "male"))
hd_data$cp_n <- factor(hd_data$cp_n, levels = c("ty angina", "atyp angina","non-ang pain", "asym"))
hd_data$fbs_n <- factor(hd_data$fbs_n, levels = c("no high glu", "high glu"))
hd_data$restecg_n <- factor(hd_data$restecg_n, levels = c("normal", "ST-T abn", "LVH"))
hd_data$exang_n <- factor(hd_data$exang_n, levels = c("no exang", "exang"))
hd_data$slope_n <- factor(hd_data$slope_n, levels = c("up", "flat","down"))
hd_data$thal_n <- factor(hd_data$thal_n, levels = c("normal", "fixed defect","rev defect"))
str(hd_data)
## 'data.frame': 297 obs. of 22 variables:
## $ age : int 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : int 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : int 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps : int 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : int 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : int 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : int 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : int 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : int 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : int 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : int 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : int 6 3 7 3 3 3 3 3 7 7 ...
## $ class : int 0 2 1 0 0 0 3 0 2 1 ...
## $ sex_n : Factor w/ 2 levels "female","male": 2 2 2 2 1 2 1 1 2 2 ...
## $ cp_n : Factor w/ 4 levels "ty angina","atyp angina",..: 1 4 4 3 2 2 4 4 4 4 ...
## $ fbs_n : Factor w/ 2 levels "no high glu",..: 2 1 1 1 1 1 1 1 1 2 ...
## $ restecg_n: Factor w/ 3 levels "normal","ST-T abn",..: 3 3 3 1 3 1 3 1 3 3 ...
## $ exang_n : Factor w/ 2 levels "no exang","exang": 1 2 2 1 1 1 1 2 1 2 ...
## $ slope_n : Factor w/ 3 levels "up","flat","down": 3 2 2 3 1 1 3 1 2 3 ...
## $ thal_n : Factor w/ 3 levels "normal","fixed defect",..: 2 1 3 1 1 1 1 1 3 3 ...
## $ class_n : num 0 1 1 0 0 0 1 0 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:6] 88 167 193 267 288 303
## ..- attr(*, "names")= chr [1:6] "88" "167" "193" "267" ...
head(hd_data,5)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 1 1 145 233 1 2 150 0 2.3 3 0 6
## 2 67 1 4 160 286 0 2 108 1 1.5 2 3 3
## 3 67 1 4 120 229 0 2 129 1 2.6 2 2 7
## 4 37 1 3 130 250 0 0 187 0 3.5 3 0 3
## 5 41 0 2 130 204 0 2 172 0 1.4 1 0 3
## class sex_n cp_n fbs_n restecg_n exang_n slope_n thal_n
## 1 0 male ty angina high glu LVH no exang down fixed defect
## 2 2 male asym no high glu LVH exang flat normal
## 3 1 male asym no high glu LVH exang flat rev defect
## 4 0 male non-ang pain no high glu normal no exang down normal
## 5 0 female atyp angina no high glu LVH no exang up normal
## class_n
## 1 0
## 2 1
## 3 1
## 4 0
## 5 0
## Stepwise Multiple Regression Analysis
mylogit <- glm (class_n ~ age+sex_n+cp_n+trestbps+chol+fbs_n+restecg_n+thalach+exang_n+oldpeak+slope_n+ca+thal_n, data = hd_data, family = "binomial")
summary(mylogit)
##
## Call:
## glm(formula = class_n ~ age + sex_n + cp_n + trestbps + chol +
## fbs_n + restecg_n + thalach + exang_n + oldpeak + slope_n +
## ca + thal_n, family = "binomial", data = hd_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.029468 2.891768 -2.085 0.03707 *
## age -0.013763 0.024745 -0.556 0.57808
## sex_nmale 1.546014 0.529995 2.917 0.00353 **
## cp_natyp angina 1.239566 0.770874 1.608 0.10784
## cp_nnon-ang pain 0.245959 0.663312 0.371 0.71078
## cp_nasym 2.086480 0.666547 3.130 0.00175 **
## trestbps 0.024364 0.011269 2.162 0.03062 *
## chol 0.004448 0.003993 1.114 0.26526
## fbs_nhigh glu -0.596246 0.607848 -0.981 0.32664
## restecg_nST-T abn 0.810202 2.435102 0.333 0.73935
## restecg_nLVH 0.473895 0.383518 1.236 0.21659
## thalach -0.017723 0.011109 -1.595 0.11065
## exang_nexang 0.709456 0.440018 1.612 0.10689
## oldpeak 0.357875 0.230070 1.556 0.11983
## slope_nflat 1.155286 0.473794 2.438 0.01475 *
## slope_ndown 0.525147 0.919661 0.571 0.56798
## ca 1.311510 0.279276 4.696 2.65e-06 ***
## thal_nfixed defect -0.010974 0.790210 -0.014 0.98892
## thal_nrev defect 1.392715 0.425194 3.275 0.00105 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.95 on 296 degrees of freedom
## Residual deviance: 191.64 on 278 degrees of freedom
## AIC: 229.64
##
## Number of Fisher Scoring iterations: 6
#remove the least significant variables
mylogit <- glm (class_n ~ age+sex_n+cp_n+trestbps+chol+fbs_n+restecg_n+thalach+exang_n+oldpeak+slope_n+ca+thal_n, data = hd_data, family = "binomial")
summary(mylogit)
##
## Call:
## glm(formula = class_n ~ age + sex_n + cp_n + trestbps + chol +
## fbs_n + restecg_n + thalach + exang_n + oldpeak + slope_n +
## ca + thal_n, family = "binomial", data = hd_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.029468 2.891768 -2.085 0.03707 *
## age -0.013763 0.024745 -0.556 0.57808
## sex_nmale 1.546014 0.529995 2.917 0.00353 **
## cp_natyp angina 1.239566 0.770874 1.608 0.10784
## cp_nnon-ang pain 0.245959 0.663312 0.371 0.71078
## cp_nasym 2.086480 0.666547 3.130 0.00175 **
## trestbps 0.024364 0.011269 2.162 0.03062 *
## chol 0.004448 0.003993 1.114 0.26526
## fbs_nhigh glu -0.596246 0.607848 -0.981 0.32664
## restecg_nST-T abn 0.810202 2.435102 0.333 0.73935
## restecg_nLVH 0.473895 0.383518 1.236 0.21659
## thalach -0.017723 0.011109 -1.595 0.11065
## exang_nexang 0.709456 0.440018 1.612 0.10689
## oldpeak 0.357875 0.230070 1.556 0.11983
## slope_nflat 1.155286 0.473794 2.438 0.01475 *
## slope_ndown 0.525147 0.919661 0.571 0.56798
## ca 1.311510 0.279276 4.696 2.65e-06 ***
## thal_nfixed defect -0.010974 0.790210 -0.014 0.98892
## thal_nrev defect 1.392715 0.425194 3.275 0.00105 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.95 on 296 degrees of freedom
## Residual deviance: 191.64 on 278 degrees of freedom
## AIC: 229.64
##
## Number of Fisher Scoring iterations: 6
mylogit <- glm (class_n ~ age+sex_n+cp_n+trestbps+thalach+exang_n+oldpeak+slope_n+ca+thal_n, data = hd_data, family = "binomial")
summary(mylogit)
##
## Call:
## glm(formula = class_n ~ age + sex_n + cp_n + trestbps + thalach +
## exang_n + oldpeak + slope_n + ca + thal_n, family = "binomial",
## data = hd_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.109337 2.778797 -1.839 0.065961 .
## age -0.008822 0.024274 -0.363 0.716275
## sex_nmale 1.406704 0.499174 2.818 0.004831 **
## cp_natyp angina 1.288744 0.758338 1.699 0.089238 .
## cp_nnon-ang pain 0.200958 0.649997 0.309 0.757194
## cp_nasym 2.162558 0.658355 3.285 0.001021 **
## trestbps 0.023991 0.010808 2.220 0.026437 *
## thalach -0.016402 0.010884 -1.507 0.131803
## exang_nexang 0.659848 0.433832 1.521 0.128266
## oldpeak 0.388212 0.222838 1.742 0.081486 .
## slope_nflat 1.209694 0.465448 2.599 0.009350 **
## slope_ndown 0.453737 0.893315 0.508 0.611506
## ca 1.264412 0.268225 4.714 2.43e-06 ***
## thal_nfixed defect -0.240894 0.762266 -0.316 0.751984
## thal_nrev defect 1.365777 0.414886 3.292 0.000995 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.95 on 296 degrees of freedom
## Residual deviance: 195.85 on 282 degrees of freedom
## AIC: 225.85
##
## Number of Fisher Scoring iterations: 6
mylogit <- glm (class_n ~ sex_n+cp_n+trestbps+oldpeak+slope_n+ca+thal_n, data = hd_data, family = "binomial")
summary(mylogit)
##
## Call:
## glm(formula = class_n ~ sex_n + cp_n + trestbps + oldpeak + slope_n +
## ca + thal_n, family = "binomial", data = hd_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.101290 1.793804 -4.516 6.29e-06 ***
## sex_nmale 1.337975 0.477314 2.803 0.005061 **
## cp_natyp angina 1.310229 0.766155 1.710 0.087241 .
## cp_nnon-ang pain 0.283613 0.660958 0.429 0.667855
## cp_nasym 2.571164 0.653209 3.936 8.28e-05 ***
## trestbps 0.022347 0.010340 2.161 0.030676 *
## oldpeak 0.439985 0.215974 2.037 0.041629 *
## slope_nflat 1.491612 0.442915 3.368 0.000758 ***
## slope_ndown 0.623749 0.837960 0.744 0.456655
## ca 1.279213 0.254868 5.019 5.19e-07 ***
## thal_nfixed defect -0.007463 0.732901 -0.010 0.991876
## thal_nrev defect 1.498453 0.405023 3.700 0.000216 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.95 on 296 degrees of freedom
## Residual deviance: 201.57 on 285 degrees of freedom
## AIC: 225.57
##
## Number of Fisher Scoring iterations: 6
## Forward Selection
fullmodel <- glm (class_n ~ age+sex_n+cp_n+trestbps+chol+fbs_n+restecg_n+thalach+exang_n+oldpeak+slope_n+ca+thal_n, data = hd_data, family = "binomial")
nothing <- glm(class_n ~ 1, data = hd_data, family = "binomial")
forwards = step(nothing, scope = list(lower=formula(nothing), upper=formula(fullmodel)), direction = "forward")
## Start: AIC=411.95
## class_n ~ 1
##
## Df Deviance AIC
## + thal_n 2 323.39 329.39
## + cp_n 3 328.75 336.75
## + ca 1 339.42 343.42
## + oldpeak 1 350.48 354.48
## + thalach 1 351.97 355.97
## + exang_n 1 355.48 359.48
## + slope_n 2 365.16 371.16
## + sex_n 1 386.12 390.12
## + age 1 394.25 398.25
## + restecg_n 2 400.28 406.28
## + trestbps 1 402.88 406.88
## <none> 409.95 411.95
## + chol 1 408.03 412.03
## + fbs_n 1 409.94 413.94
##
## Step: AIC=329.39
## class_n ~ thal_n
##
## Df Deviance AIC
## + ca 1 273.59 281.59
## + cp_n 3 275.86 287.86
## + thalach 1 288.27 296.27
## + oldpeak 1 294.27 302.27
## + exang_n 1 295.84 303.84
## + slope_n 2 300.98 310.98
## + age 1 312.61 320.61
## + restecg_n 2 311.48 321.48
## + sex_n 1 320.64 328.64
## + trestbps 1 320.68 328.68
## + chol 1 320.89 328.89
## <none> 323.39 329.39
## + fbs_n 1 322.99 330.99
##
## Step: AIC=281.59
## class_n ~ thal_n + ca
##
## Df Deviance AIC
## + cp_n 3 237.05 251.05
## + exang_n 1 248.64 258.64
## + thalach 1 250.87 260.87
## + slope_n 2 252.04 264.04
## + oldpeak 1 254.15 264.15
## + restecg_n 2 264.70 276.70
## + sex_n 1 270.43 280.43
## + fbs_n 1 270.57 280.57
## <none> 273.59 281.59
## + trestbps 1 271.62 281.62
## + chol 1 272.94 282.94
## + age 1 273.06 283.06
##
## Step: AIC=251.05
## class_n ~ thal_n + ca + cp_n
##
## Df Deviance AIC
## + oldpeak 1 221.59 237.59
## + slope_n 2 219.63 237.63
## + thalach 1 227.03 243.03
## + exang_n 1 228.41 244.41
## + restecg_n 2 230.69 248.69
## + sex_n 1 233.16 249.16
## + trestbps 1 233.65 249.65
## <none> 237.05 251.05
## + chol 1 236.11 252.11
## + age 1 236.30 252.30
## + fbs_n 1 236.82 252.82
##
## Step: AIC=237.59
## class_n ~ thal_n + ca + cp_n + oldpeak
##
## Df Deviance AIC
## + slope_n 2 213.14 233.14
## + thalach 1 216.43 234.43
## + exang_n 1 216.47 234.47
## + sex_n 1 217.76 235.76
## + restecg_n 2 216.46 236.46
## + trestbps 1 219.12 237.12
## <none> 221.59 237.59
## + chol 1 220.98 238.98
## + age 1 221.19 239.19
## + fbs_n 1 221.47 239.47
##
## Step: AIC=233.14
## class_n ~ thal_n + ca + cp_n + oldpeak + slope_n
##
## Df Deviance AIC
## + sex_n 1 206.48 228.48
## + exang_n 1 209.41 231.41
## + trestbps 1 209.98 231.98
## <none> 213.14 233.14
## + thalach 1 211.14 233.14
## + restecg_n 2 209.49 233.49
## + chol 1 212.81 234.81
## + fbs_n 1 213.04 235.04
## + age 1 213.10 235.10
##
## Step: AIC=228.48
## class_n ~ thal_n + ca + cp_n + oldpeak + slope_n + sex_n
##
## Df Deviance AIC
## + trestbps 1 201.57 225.57
## + exang_n 1 202.74 226.74
## + thalach 1 203.88 227.88
## <none> 206.48 228.48
## + chol 1 204.94 228.94
## + restecg_n 2 203.28 229.28
## + age 1 205.95 229.95
## + fbs_n 1 206.26 230.26
##
## Step: AIC=225.57
## class_n ~ thal_n + ca + cp_n + oldpeak + slope_n + sex_n + trestbps
##
## Df Deviance AIC
## + exang_n 1 198.25 224.25
## + thalach 1 198.34 224.34
## <none> 201.57 225.57
## + chol 1 200.47 226.47
## + fbs_n 1 200.87 226.87
## + restecg_n 2 199.39 227.39
## + age 1 201.53 227.53
##
## Step: AIC=224.25
## class_n ~ thal_n + ca + cp_n + oldpeak + slope_n + sex_n + trestbps +
## exang_n
##
## Df Deviance AIC
## + thalach 1 195.98 223.98
## <none> 198.25 224.25
## + chol 1 197.12 225.12
## + fbs_n 1 197.27 225.27
## + restecg_n 2 196.13 226.13
## + age 1 198.20 226.20
##
## Step: AIC=223.98
## class_n ~ thal_n + ca + cp_n + oldpeak + slope_n + sex_n + trestbps +
## exang_n + thalach
##
## Df Deviance AIC
## <none> 195.98 223.98
## + chol 1 194.47 224.47
## + fbs_n 1 195.13 225.13
## + age 1 195.85 225.85
## + restecg_n 2 193.96 225.96
##Backward Selection
fullmodel <- glm (class_n ~ age+sex_n+cp_n+trestbps+chol+fbs_n+restecg_n+thalach+exang_n+oldpeak+slope_n+ca+thal_n, data = hd_data, family = "binomial")
backwards = step(fullmodel, direction = "backward")
## Start: AIC=229.64
## class_n ~ age + sex_n + cp_n + trestbps + chol + fbs_n + restecg_n +
## thalach + exang_n + oldpeak + slope_n + ca + thal_n
##
## Df Deviance AIC
## - restecg_n 2 193.24 227.24
## - age 1 191.95 227.95
## - fbs_n 1 192.63 228.63
## - chol 1 192.86 228.86
## <none> 191.64 229.64
## - oldpeak 1 194.15 230.15
## - exang_n 1 194.21 230.21
## - thalach 1 194.28 230.28
## - slope_n 2 197.96 231.96
## - trestbps 1 196.53 232.53
## - sex_n 1 200.86 236.86
## - thal_n 2 203.93 237.93
## - cp_n 3 210.79 242.79
## - ca 1 220.00 256.00
##
## Step: AIC=227.24
## class_n ~ age + sex_n + cp_n + trestbps + chol + fbs_n + thalach +
## exang_n + oldpeak + slope_n + ca + thal_n
##
## Df Deviance AIC
## - age 1 193.50 225.50
## - fbs_n 1 194.20 226.20
## - chol 1 195.01 227.01
## <none> 193.24 227.24
## - oldpeak 1 195.69 227.69
## - exang_n 1 195.77 227.77
## - thalach 1 195.98 227.98
## - slope_n 2 200.43 230.43
## - trestbps 1 198.76 230.76
## - thal_n 2 204.85 234.85
## - sex_n 1 203.63 235.63
## - cp_n 3 212.30 240.30
## - ca 1 222.28 254.28
##
## Step: AIC=225.5
## class_n ~ sex_n + cp_n + trestbps + chol + fbs_n + thalach +
## exang_n + oldpeak + slope_n + ca + thal_n
##
## Df Deviance AIC
## - fbs_n 1 194.47 224.47
## - chol 1 195.13 225.13
## <none> 193.50 225.50
## - thalach 1 195.99 225.99
## - oldpeak 1 196.09 226.09
## - exang_n 1 196.13 226.13
## - slope_n 2 200.57 228.57
## - trestbps 1 198.78 228.78
## - thal_n 2 205.07 233.07
## - sex_n 1 204.50 234.50
## - cp_n 3 212.97 238.97
## - ca 1 223.55 253.55
##
## Step: AIC=224.47
## class_n ~ sex_n + cp_n + trestbps + chol + thalach + exang_n +
## oldpeak + slope_n + ca + thal_n
##
## Df Deviance AIC
## - chol 1 195.98 223.98
## <none> 194.47 224.47
## - exang_n 1 196.79 224.79
## - thalach 1 197.12 225.12
## - oldpeak 1 197.48 225.48
## - slope_n 2 201.16 227.16
## - trestbps 1 199.16 227.16
## - thal_n 2 206.79 232.79
## - sex_n 1 204.91 232.91
## - cp_n 3 216.92 240.92
## - ca 1 223.56 251.56
##
## Step: AIC=223.98
## class_n ~ sex_n + cp_n + trestbps + thalach + exang_n + oldpeak +
## slope_n + ca + thal_n
##
## Df Deviance AIC
## <none> 195.98 223.98
## - thalach 1 198.25 224.25
## - exang_n 1 198.34 224.34
## - oldpeak 1 199.23 225.23
## - trestbps 1 201.11 227.11
## - slope_n 2 203.20 227.20
## - sex_n 1 205.07 231.07
## - thal_n 2 209.17 233.17
## - cp_n 3 218.57 240.57
## - ca 1 225.74 251.74
After running step-wise, forward, and backward selection, step-wise selection is parsimonious and biological more plausible, and accordingly, it is selected.
Predict the outcome of developing heart disease using the model.
Set the threshold to 0.5, meaning that if the predicted probability exceeds 0.5, it will regard as heart disease.
# Confusion Matrix
# Generate predictions using the logistic regression model
hd_data$glm.probs <- predict(mylogit, type = "response")
# Classify predictions as "hd_ds" or "no hd_ds" based on a threshold of 0.5
hd_data$glm.pred <- ifelse(hd_data$glm.probs > 0.5, "hd_ds", "no hd_ds")
# Convert the classification results to a factor
hd_data$glm.pred <- factor(hd_data$glm.pred, levels = c("hd_ds", "no hd_ds"))
# Display the first few rows of the birth dataframe
head(hd_data, 5) %>% select(class_n, glm.probs, glm.pred)
## class_n glm.probs glm.pred
## 1 0 0.1307061 no hd_ds
## 2 1 0.9953795 hd_ds
## 3 1 0.9944144 hd_ds
## 4 0 0.1960954 no hd_ds
## 5 0 0.0366164 no hd_ds
# Actual outcomes
actual <- hd_data$class_n
prediction <- ifelse(hd_data$glm.probs > 0.5, "1", "0")
# Accuracy
accuracy <- Metrics::accuracy(actual, prediction)
accuracy
## [1] 0.8518519
# Generate the actual and prediction labels
results <- tibble(
truth = factor(ifelse(actual == 1, "hd_ds", "no_hd_ds"), levels = c("no_hd_ds", "hd_ds")),
estimate = factor(ifelse(prediction == 1, "hd_ds", "no_hd_ds"), levels = c("no_hd_ds", "hd_ds"))
)
# Confusion matrix
confusion <- yardstick::conf_mat(results, truth = truth, estimate = estimate)
confusion
## Truth
## Prediction no_hd_ds hd_ds
## no_hd_ds 142 26
## hd_ds 18 111
True Positive: 111
False Positive: 18
True Negative: 142
False Negative: 26
Sensitivity: TP/(TP+FN) = 0.81
Specificity: TN/(TN+FP) = 0.88
actual <- factor(hd_data$class_n, levels = c("0", "1"))
predicted <- factor(prediction, levels = c("0", "1"))
confusionMatrix(predicted, actual, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 142 26
## 1 18 111
##
## Accuracy : 0.8519
## 95% CI : (0.8063, 0.8902)
## No Information Rate : 0.5387
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7007
##
## Mcnemar's Test P-Value : 0.2913
##
## Sensitivity : 0.8102
## Specificity : 0.8875
## Pos Pred Value : 0.8605
## Neg Pred Value : 0.8452
## Prevalence : 0.4613
## Detection Rate : 0.3737
## Detection Prevalence : 0.4343
## Balanced Accuracy : 0.8489
##
## 'Positive' Class : 1
##
The model achieves high specificity (88.8%), meaning it is good at identifying patients without heart disease.
The sensitivity (81.1%) indicates it is reasonably good at identifying patients with heart disease, but there is room for improvement.
The accuracy (85.1%) shows the model performs well overall, correctly predicting the class for a large majority of cases.