Skip to content

Commit a24d904

Browse files
committed
Elastic net regression exercise done!
1 parent a0e1c35 commit a24d904

File tree

8 files changed

+1178
-1343
lines changed

8 files changed

+1178
-1343
lines changed

data/.Rhistory

Lines changed: 0 additions & 512 deletions
This file was deleted.

data/Obstetrics_Periodontal_Therapy.csv

Lines changed: 751 additions & 751 deletions
Large diffs are not rendered by default.

data/Obt_Perio_ML.Rdata

42.3 KB
Binary file not shown.

data/make_data.R

Lines changed: 69 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -335,26 +335,27 @@ opt <- opt %>%
335335
opt <- opt %>%
336336
filter(!if_any(all_of(c("Apgar1", "Apgar5", "Birthweight", "Any.SAE.",
337337
"Fetal.congenital.anomaly", "Preg.ended...37.wk", "GA.at.outcome")), is.na)) %>%
338-
select(-c(Prev.preg, Birth.outcome, Drug.Add, Polyhyd, Mom.HIV.status,
339-
X..Vis.Elig, X1st.Miss.Vis, BL.Cortico, O1B1, O1B5, O61, O65,
340-
O81, O85, OTNF1, Oligo)) %>%
341-
select(where(~sum(is.na(.)) < NAcutoff))
338+
dplyr::select(-c(Prev.preg, Use.Alc, Drug.Add, Birth.outcome, Polyhyd,
339+
Mom.HIV.status, V3.Cortico, BL.Cortico, V5.Cortico,
340+
O1B1, O1B5, O61, O65, O81,
341+
O85, OTNF1, OTNF5, Oligo)) %>%
342+
dplyr::select(where(~sum(is.na(.)) < NAcutoff))
342343

343344

344345
# Make combined race/ethnicity variable (short integer form)
345346
# Very rare race/ethnicity categories are removed (e.g. not enough data points)
346347
Race <- opt %>%
347348
group_by(combin) %>%
348349
summarise(each = n()) %>%
349-
filter(each >= 20) %>% # The cutoff of 7 was picked based on results of summary output
350-
select(-each) %>%
350+
dplyr::filter(each >= 20) %>% # The cutoff of 20 was picked based on results of summary output
351+
dplyr::select(-each) %>%
351352
mutate(Race = as.character(1:nrow(.)))
352353

353354

354355
# Joining new short race/ethnicity variable with full dataset, remove redundant columns.
355356
opt <- left_join(opt, Race) %>%
356-
filter(!is.na(Race)) %>%
357-
select(-c(combin, Black:Hisp)) %>%
357+
dplyr::filter(!is.na(Race)) %>%
358+
dplyr::select(-c(combin, Black:Hisp)) %>%
358359
relocate(Race, .after = Age)
359360

360361

@@ -370,40 +371,27 @@ opt <- opt %>%
370371
mutate(N.PAL.sites = as.factor(ifelse(N.PAL.sites >= 2 , "3-33", as.character(N.PAL.sites))))
371372

372373

374+
outvars <- c("PID", "Apgar1", "Apgar5", "Birthweight", "GA.at.outcome", "Any.SAE.", "Preg.ended...37.wk")
373375

376+
outcomes <- opt %>%
377+
dplyr::select(outvars)
374378

375-
# Remove ID (should not be used for imputation)
376-
#Outcomes <- c(
377-
# "PID",
378-
# "Apgar1",
379-
# "Apgar5",
380-
# "Any.SAE.",
381-
# "Birthweight",
382-
# "Fetal.congenital.anomaly",
383-
# "Preg.ended...37.wk",
384-
# "GA...1st.SAE",
385-
# "GA.at.outcome")
386-
387-
#optOut <- opt %>%
388-
# select(all_of(Outcomes))
389-
390-
#opt <- opt %>%
391-
# select(-all_of(Outcomes))
379+
opt <- opt %>%
380+
dplyr::select(-outvars)
392381

393-
PID <- opt %>%
394-
select(PID)
395382

396383

397384
# Pattern of missingness
398-
md.pattern(opt[,-1], rotate.names = TRUE)
385+
md.pattern(opt, rotate.names = TRUE)
399386

400387
# Check the methods used for imputing each variable
401-
init <- mice(opt[,-1], maxit=0)
388+
init <- mice(opt, maxit=0)
402389
meth <- init$method
403390
meth
404391

405-
# Impute missing values
406-
optImp <- mice(opt[,-1], maxit=10, method = meth, seed = 1234)
392+
# Impute missing values - NOTE: the outcome variables are included in the imputation,
# which is not ideal, but they are needed so the imputed outcomes can be kept with the data
393+
optImp <- mice(opt, maxit=10, method = meth, seed = 1234)
394+
407395

408396

409397

@@ -430,8 +418,12 @@ stripplot(optImp, OCRP5, col=c("grey",mdc(2)),pch=c(1,20))
430418
#optImp <- bind_cols(optOut, complete(optImp, 1))
431419

432420

433-
# Bind PID back to dataset
434-
optImp <- bind_cols(PID, complete(optImp, 1))
421+
# # Bind PID back to dataset
422+
# optImp <- bind_cols(PID, complete(optImp, 1))
423+
424+
# Bind outcomes and PIDs back to dataset
425+
optImp <- bind_cols(outcomes, complete(optImp, 1))
426+
435427

436428
# Full clean version to have
437429
write_csv(optImp, file = 'Obstetrics_Periodontal_Therapy.csv')
@@ -440,6 +432,7 @@ write_csv(optImp, file = 'Obstetrics_Periodontal_Therapy.csv')
440432

441433

442434

435+
443436
# Check balance of factor variables for ML
444437
factor_counts <- optImp %>%
445438
dplyr::select(where(is.factor)) %>%
@@ -451,17 +444,55 @@ factor_counts <- optImp %>%
451444

452445
factor_counts
453446

447+
448+
449+
450+
454451
# Smaller more balanced version for LASSO and R
455452
optML <- optImp %>%
456-
dplyr::select(-c(Diabetes,
457-
Use.Alc,
458-
Fetal.congenital.anomaly,
459-
Any.stillbirth,
453+
dplyr::select(-c(X..Vis.Elig,
454+
Diabetes,
455+
Fetal.congenital.anomaly,
460456
Hypertension,
461457
Traumatic.Inj,
462458
BL.Bac.vag,
463-
ETXU_CAT1))
459+
ETXU_CAT1)) %>%
460+
mutate(Any.SAE.= as.factor(ifelse(Any.SAE. == 'Yes', 1, 0)),
461+
Preg.ended...37.wk = as.factor(ifelse(Preg.ended...37.wk == 'Yes', 1, 0)))
462+
463+
464+
# Upsample to get more examples of rare class output
465+
optML <- optML %>%
466+
dplyr::select(-PID)
467+
468+
optML <- upSample(x = optML[, -which(names(optML) == "Preg.ended...37.wk")], y = optML$Preg.ended...37.wk, yname = "Preg.ended...37.wk") %>%
469+
as_tibble()
470+
471+
optML <- optML %>%
472+
mutate(PID= paste0('P', 1:nrow(optML))) %>%
473+
relocate(PID, .before = Clinic)
474+
475+
476+
477+
# Sample rows to remove 20% from the upsampled class
478+
set.seed(123)
479+
480+
nclass1 <- optML %>%
481+
filter(Preg.ended...37.wk == "1")
482+
483+
down1 <- round(0.35 * nrow(nclass1))
484+
485+
PID1 <- nclass1 %>%
486+
slice_sample(n = down1) %>%
487+
pull(PID)
488+
489+
490+
# Combine with other class
491+
optML <- optML %>%
492+
dplyr::filter(!PID %in% PID1)
493+
464494

495+
# Up-sample sparse class
465496

466497
save(optML, file = 'Obt_Perio_ML.Rdata')
467498

exercises/.Rhistory

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
library(tidyverse)
2+
library(caret)
3+
library(glmnet)
4+
library(MASS)
5+
load(file = "../data/Obt_Perio_ML.Rdata")
6+
# Reshape data to long format for ggplot2
7+
long_data <- optML %>%
8+
dplyr::select(where(is.numeric)) %>%
9+
pivot_longer(cols = everything(),
10+
names_to = "variable",
11+
values_to = "value")
12+
# Plot histograms for each numeric variable in one grid
13+
ggplot(long_data, aes(x = value)) +
14+
geom_histogram(binwidth = 0.5, fill = "#9395D3", color ='grey30') +
15+
facet_wrap(~ variable, scales = "free") +
16+
theme_minimal()
17+
summary(optML)
18+
optML$GA...1st.SAE
19+
hist(optML$GA...1st.SAE)
20+
tabl3(as.factor(optML$GA...1st.SAE))
21+
table(as.factor(optML$GA...1st.SAE))
22+
hist(optML$OTNF5)
23+
hist(optML$OTNF5, breaks = 20)
24+
optML$OTNF5

exercises/exercise5B.qmd

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,23 +39,22 @@ Birthweight - Weight of baby at birth (grams)
3939
Apgar1 - Apgar score, a summary of a newborn infant's appearance at birth, range: 0-10
4040
Apgar5 - Apgar score at 5 minutes, numeric, range: 0-10
4141
Preg.ended...37.wk - Pregnancy ended before week 37, categorical (0 = no, 1 = yes)
42+
Any.SAE. - Whether participant experienced any serious adverse events (Yes, No)
4243
```
4344

4445
The remaining 28 variables we will consider as potential explanatory variables for these outcomes.
4546

46-
3. Do some basic summary statistics and distributional plots to get a feel for the data. Which types of variables do we have?
47+
3. Do some basic summary statistics. How many categorical variables and how many numeric variables do you have? Try to make distributional plots for a couple of your numeric variables (or all if you would like) to get a feel for some of the data distributions you have.
4748

4849
4. Make count tables for all your categorical/factor variables, are they balanced?
4950

5051
## Part 1: Elastic Net Regression
5152

5253
Elastic Net regression is part of the family of penalized regressions, which also includes Ridge regression and LASSO regression. Penalized regressions are especially useful when dealing with many predictors, as they help eliminate less informative ones while retaining the important predictors, making them ideal for high-dimensional datasets. One of the key advantages of Elastic Net over other types of penalized regression is its ability to handle multicollinearity and situations where the number of predictors exceeds the number of observations.
5354

54-
As described above we have five variables which could be considered outcomes as these where all measured at the end of pregnancy. We can only work with one outcome at a time so we have combined these into a single variable named `Outcome.Birth`.
55+
As described above we have six variables which could be considered outcomes as these were all measured at the end of pregnancy. We can only work with one outcome at a time and we will pick `Preg.ended...37.wk`. This variable is a factor variable which denotes if a woman gave birth prematurely (1=yes, 0=no).
5556

56-
The variable `Outcome.Birth` represents any kind of 'critical' birth outcome, reflected by a low Apgar score, premature birth or critically low birth weight (defined as \< 1500 grams). `Outcome.Birth` is a factor variable where; 0 = no event and 1 = event.
57-
58-
5. As you will use the response `Outcome.Birth`, you should remove the original five outcome measures from your dataset.
57+
5. As you will use the response `Preg.ended...37.wk`, you should remove the other five outcome measures from your dataset.
5958

6059
6. Elastic net regression can be sensitive to large differences in the range of numeric/integer variables, as such these variables should be scaled. Scale all numeric/integer variables in your dataset.
6160

@@ -65,11 +64,11 @@ The variable `Outcome.Birth` represents any kind of 'critical' birth outcome, re
6564
mutate(across(...))
6665
:::
6766

68-
7. Split your dataset into train and test set, you should have 70% of the data in the training set and 30% in the test set. How you chose to split is up to you, BUT afterwards you should ensure that for the categorical/factor variables all levels are represented in both sets.
67+
7. Split your dataset into train and test set, you should have 75% of the data in the training set and 25% in the test set. How you choose to split is up to you, BUT afterwards you should ensure that for the categorical/factor variables all levels are represented in both sets.
6968

70-
8. After dividing into train and test set pull out the response variable `Outcome.Birth` into its own vector for both datasets, name these: `y_train` and `y_test`.
69+
8. After dividing into train and test set pull out the response variable `Preg.ended...37.wk` into its own vector for both datasets, name these: `y_train` and `y_test`.
7170

72-
9. Remove the response variable `Outcome.Birth` from the train and test set, as well as `PID` (if you have not already done so), as we should obviously not use this for training or testing.
71+
9. Remove the response variable `Preg.ended...37.wk` from the train and test set, as well as `PID` (if you have not already done so), as we should obviously not use this for training or testing.
7372

7473
You will employ the package `glmnet` to perform Elastic Net Regression. The main function from this package is `glmnet()` which we will use to fit the model. Additionally, you will also perform cross validation with `cv.glmnet()` to obtain the best value of the model hyper-parameter, lambda (λ).
7574

@@ -114,4 +113,4 @@ coeffsDat <- as.data.frame(as.matrix(coeffs))
114113

115114
16. Make a plot that shows the absolute importance of the variables retained in your model. This could be barplot with variable names on the x-axis and the height of the bars denoting absolute size of coefficient).
116115

117-
17. Make a logistic regression using this dataset (you already have your train data, test data, y_train and y_test). Do you get similar results?
116+
17. Make a logistic regression using this same dataset (you already have your train data, test data, y_train and y_test). Do you get similar results?

0 commit comments

Comments
 (0)