Skip to content

Commit a24d904

Browse files
committed
Elastic net regression exercise done!
1 parent a0e1c35 commit a24d904

File tree

8 files changed

+1178
-1343
lines changed

8 files changed

+1178
-1343
lines changed

data/.Rhistory

Lines changed: 0 additions & 512 deletions
This file was deleted.

data/Obstetrics_Periodontal_Therapy.csv

Lines changed: 751 additions & 751 deletions
Large diffs are not rendered by default.

data/Obt_Perio_ML.Rdata

42.3 KB
Binary file not shown.

data/make_data.R

Lines changed: 69 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -335,26 +335,27 @@ opt <- opt %>%
335335
opt <- opt %>%
336336
filter(!if_any(all_of(c("Apgar1", "Apgar5", "Birthweight", "Any.SAE.",
337337
"Fetal.congenital.anomaly", "Preg.ended...37.wk", "GA.at.outcome")), is.na)) %>%
338-
select(-c(Prev.preg, Birth.outcome, Drug.Add, Polyhyd, Mom.HIV.status,
339-
X..Vis.Elig, X1st.Miss.Vis, BL.Cortico, O1B1, O1B5, O61, O65,
340-
O81, O85, OTNF1, Oligo)) %>%
341-
select(where(~sum(is.na(.)) < NAcutoff))
338+
dplyr::select(-c(Prev.preg, Use.Alc, Drug.Add, Birth.outcome, Polyhyd,
339+
Mom.HIV.status, V3.Cortico, BL.Cortico, V5.Cortico,
340+
O1B1, O1B5, O61, O65, O81,
341+
O85, OTNF1, OTNF5, Oligo)) %>%
342+
dplyr::select(where(~sum(is.na(.)) < NAcutoff))
342343

343344

344345
# Make combined race/ethnicity variable (short integer form)
345346
# Very rare race/ethnicity categories are removed (e.g. not enough data points)
346347
Race <- opt %>%
347348
group_by(combin) %>%
348349
summarise(each = n()) %>%
349-
filter(each >= 20) %>% # The cutoff of 7 was picked based on results of summary output
350-
select(-each) %>%
350+
dplyr::filter(each >= 20) %>% # The cutoff of 20 was picked based on results of summary output
351+
dplyr::select(-each) %>%
351352
mutate(Race = as.character(1:nrow(.)))
352353

353354

354355
# Joining new short race/ethnicity variable with full dataset, remove redundant columns.
355356
opt <- left_join(opt, Race) %>%
356-
filter(!is.na(Race)) %>%
357-
select(-c(combin, Black:Hisp)) %>%
357+
dplyr::filter(!is.na(Race)) %>%
358+
dplyr::select(-c(combin, Black:Hisp)) %>%
358359
relocate(Race, .after = Age)
359360

360361

@@ -370,40 +371,27 @@ opt <- opt %>%
370371
mutate(N.PAL.sites = as.factor(ifelse(N.PAL.sites >= 2 , "3-33", as.character(N.PAL.sites))))
371372

372373

374+
outvars <- c("PID", "Apgar1", "Apgar5", "Birthweight", "GA.at.outcome", "Any.SAE.", "Preg.ended...37.wk")
373375

376+
outcomes <- opt %>%
377+
dplyr::select(outvars)
374378

375-
# Remove ID (should not be used for imputation)
376-
#Outcomes <- c(
377-
# "PID",
378-
# "Apgar1",
379-
# "Apgar5",
380-
# "Any.SAE.",
381-
# "Birthweight",
382-
# "Fetal.congenital.anomaly",
383-
# "Preg.ended...37.wk",
384-
# "GA...1st.SAE",
385-
# "GA.at.outcome")
386-
387-
#optOut <- opt %>%
388-
# select(all_of(Outcomes))
389-
390-
#opt <- opt %>%
391-
# select(-all_of(Outcomes))
379+
opt <- opt %>%
380+
dplyr::select(-outvars)
392381

393-
PID <- opt %>%
394-
select(PID)
395382

396383

397384
# Pattern of missingness
398-
md.pattern(opt[,-1], rotate.names = TRUE)
385+
md.pattern(opt, rotate.names = TRUE)
399386

400387
# Check the methods used for imputing each variable
401-
init <- mice(opt[,-1], maxit=0)
388+
init <- mice(opt, maxit=0)
402389
meth <- init$method
403390
meth
404391

405-
# Impute missing values
406-
optImp <- mice(opt[,-1], maxit=10, method = meth, seed = 1234)
392+
# Impute missing values - NOTE: the outcome variables are included in the imputation,
# which is not ideal, but they are needed so the imputed outcomes can be kept with the data
393+
optImp <- mice(opt, maxit=10, method = meth, seed = 1234)
394+
407395

408396

409397

@@ -430,8 +418,12 @@ stripplot(optImp, OCRP5, col=c("grey",mdc(2)),pch=c(1,20))
430418
#optImp <- bind_cols(optOut, complete(optImp, 1))
431419

432420

433-
# Bind PID back to dataset
434-
optImp <- bind_cols(PID, complete(optImp, 1))
421+
# # Bind PID back to dataset
422+
# optImp <- bind_cols(PID, complete(optImp, 1))
423+
424+
# Bind outcomes and PIDs back to dataset
425+
optImp <- bind_cols(outcomes, complete(optImp, 1))
426+
435427

436428
# Full clean version to have
437429
write_csv(optImp, file = 'Obstetrics_Periodontal_Therapy.csv')
@@ -440,6 +432,7 @@ write_csv(optImp, file = 'Obstetrics_Periodontal_Therapy.csv')
440432

441433

442434

435+
443436
# Check balance of factor variables for ML
444437
factor_counts <- optImp %>%
445438
dplyr::select(where(is.factor)) %>%
@@ -451,17 +444,55 @@ factor_counts <- optImp %>%
451444

452445
factor_counts
453446

447+
448+
449+
450+
454451
# Smaller more balanced version for LASSO and R
455452
optML <- optImp %>%
456-
dplyr::select(-c(Diabetes,
457-
Use.Alc,
458-
Fetal.congenital.anomaly,
459-
Any.stillbirth,
453+
dplyr::select(-c(X..Vis.Elig,
454+
Diabetes,
455+
Fetal.congenital.anomaly,
460456
Hypertension,
461457
Traumatic.Inj,
462458
BL.Bac.vag,
463-
ETXU_CAT1))
459+
ETXU_CAT1)) %>%
460+
mutate(Any.SAE.= as.factor(ifelse(Any.SAE. == 'Yes', 1, 0)),
461+
Preg.ended...37.wk = as.factor(ifelse(Preg.ended...37.wk == 'Yes', 1, 0)))
462+
463+
464+
# Upsample to get more examples of rare class output
465+
optML <- optML %>%
466+
dplyr::select(-PID)
467+
468+
optML <- upSample(x = optML[, -which(names(optML) == "Preg.ended...37.wk")], y = optML$Preg.ended...37.wk, yname = "Preg.ended...37.wk") %>%
469+
as_tibble()
470+
471+
optML <- optML %>%
472+
mutate(PID= paste0('P', 1:nrow(optML))) %>%
473+
relocate(PID, .before = Clinic)
474+
475+
476+
477+
# Sample rows to remove 20% from the upsampled class
478+
set.seed(123)
479+
480+
nclass1 <- optML %>%
481+
filter(Preg.ended...37.wk == "1")
482+
483+
down1 <- round(0.35 * nrow(nclass1))
484+
485+
PID1 <- nclass1 %>%
486+
slice_sample(n = down1) %>%
487+
pull(PID)
488+
489+
490+
# Combine with other class
491+
optML <- optML %>%
492+
dplyr::filter(!PID %in% PID1)
493+
464494

495+
# Up-sample sparse class
465496

466497
save(optML, file = 'Obt_Perio_ML.Rdata')
467498

exercises/.Rhistory

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
library(tidyverse)
2+
library(caret)
3+
library(glmnet)
4+
library(MASS)
5+
load(file = "../data/Obt_Perio_ML.Rdata")
6+
# Reshape data to long format for ggplot2
7+
long_data <- optML %>%
8+
dplyr::select(where(is.numeric)) %>%
9+
pivot_longer(cols = everything(),
10+
names_to = "variable",
11+
values_to = "value")
12+
# Plot histograms for each numeric variable in one grid
13+
ggplot(long_data, aes(x = value)) +
14+
geom_histogram(binwidth = 0.5, fill = "#9395D3", color ='grey30') +
15+
facet_wrap(~ variable, scales = "free") +
16+
theme_minimal()
17+
summary(optML)
18+
optML$GA...1st.SAE
19+
hist(optML$GA...1st.SAE)
20+
tabl3(as.factor(optML$GA...1st.SAE))
21+
table(as.factor(optML$GA...1st.SAE))
22+
hist(optML$OTNF5)
23+
hist(optML$OTNF5, breaks = 20)
24+
optML$OTNF5

exercises/exercise5B.qmd

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,23 +39,22 @@ Birthweight - Weight of baby at birth (grams)
3939
Apgar1 - Apgar score, a summary of a newborn infant's appearance at birth, range: 0-10
4040
Apgar5 - Apgar score at 5 minutes, numeric, range: 0-10
4141
Preg.ended...37.wk - Pregnancy ended before week 37, categorical (0 = no, 1 = yes)
42+
Any.SAE. - Whether participant experienced any serious adverse events (Yes, No)
4243
```
4344

4445
The remaining 28 variables we will consider as potential explanatory variables for these outcomes.
4546

46-
3. Do some basic summary statistics and distributional plots to get a feel for the data. Which types of variables do we have?
47+
3. Do some basic summary statistics. How many categorical variables and how many numeric variables do you have? Try to make distributional plots for a couple of your numeric variables (or all if you would like) to get a feel for some of the data distributions you have.
4748

4849
4. Make count tables for all your categorical/factor variables, are they balanced?
4950

5051
## Part 1: Elastic Net Regression
5152

5253
Elastic Net regression is part of the family of penalized regressions, which also includes Ridge regression and LASSO regression. Penalized regressions are especially useful when dealing with many predictors, as they help eliminate less informative ones while retaining the important predictors, making them ideal for high-dimensional datasets. One of the key advantages of Elastic Net over other types of penalized regression is its ability to handle multicollinearity and situations where the number of predictors exceeds the number of observations.
5354

54-
As described above we have five variables which could be considered outcomes as these where all measured at the end of pregnancy. We can only work with one outcome at a time so we have combined these into a single variable named `Outcome.Birth`.
55+
As described above we have six variables which could be considered outcomes as these were all measured at the end of pregnancy. We can only work with one outcome at a time and we will pick `Preg.ended...37.wk`. This variable is a factor variable which denotes if a woman gave birth prematurely (1=yes, 0=no).
5556

56-
The variable `Outcome.Birth` represents any kind of 'critical' birth outcome, reflected by a low Apgar score, premature birth or critically low birth weight (defined as \< 1500 grams). `Outcome.Birth` is a factor variable where; 0 = no event and 1 = event.
57-
58-
5. As you will use the response `Outcome.Birth`, you should remove the original five outcome measures from your dataset.
57+
5. As you will use the response `Preg.ended...37.wk`, you should remove the other five outcome measures from your dataset.
5958

6059
6. Elastic net regression can be sensitive to large differences in the range of numeric/integer variables, as such these variables should be scaled. Scale all numeric/integer variables in your dataset.
6160

@@ -65,11 +64,11 @@ The variable `Outcome.Birth` represents any kind of 'critical' birth outcome, re
6564
mutate(across(...))
6665
:::
6766

68-
7. Split your dataset into train and test set, you should have 70% of the data in the training set and 30% in the test set. How you chose to split is up to you, BUT afterwards you should ensure that for the categorical/factor variables all levels are represented in both sets.
67+
7. Split your dataset into train and test set, you should have 75% of the data in the training set and 25% in the test set. How you choose to split is up to you, BUT afterwards you should ensure that for the categorical/factor variables all levels are represented in both sets.
6968

70-
8. After dividing into train and test set pull out the response variable `Outcome.Birth` into its own vector for both datasets, name these: `y_train` and `y_test`.
69+
8. After dividing into train and test set pull out the response variable `Preg.ended...37.wk` into its own vector for both datasets, name these: `y_train` and `y_test`.
7170

72-
9. Remove the response variable `Outcome.Birth` from the train and test set, as well as `PID` (if you have not already done so), as we should obviously not use this for training or testing.
71+
9. Remove the response variable `Preg.ended...37.wk` from the train and test set, as well as `PID` (if you have not already done so), as we should obviously not use this for training or testing.
7372

7473
You will employ the package `glmnet` to perform Elastic Net Regression. The main function from this package is `glmnet()` which we will use to fit the model. Additionally, you will also perform cross validation with `cv.glmnet()` to obtain the best value of the model hyper-parameter, lambda (λ).
7574

@@ -114,4 +113,4 @@ coeffsDat <- as.data.frame(as.matrix(coeffs))
114113

115114
16. Make a plot that shows the absolute importance of the variables retained in your model. This could be barplot with variable names on the x-axis and the height of the bars denoting absolute size of coefficient).
116115

117-
17. Make a logistic regression using this dataset (you already have your train data, test data, y_train and y_test). Do you get similar results?
116+
17. Make a logistic regression using this same dataset (you already have your train data, test data, y_train and y_test). Do you get similar results?

0 commit comments

Comments
 (0)