Commit b51badc

Exercise 5B now with random forest
1 parent a24d904 commit b51badc

File tree

7 files changed: +447, -284 lines changed

data/Obt_Perio_ML.Rdata

-8.59 KB
Binary file not shown.

data/make_data.R

Lines changed: 19 additions & 15 deletions
@@ -371,8 +371,11 @@ opt <- opt %>%
   mutate(N.PAL.sites = as.factor(ifelse(N.PAL.sites >= 2, "3-33", as.character(N.PAL.sites))))
 
 
+
 outvars <- c("PID", "Apgar1", "Apgar5", "Birthweight", "GA.at.outcome", "Any.SAE.", "Preg.ended...37.wk")
 
+
+
 outcomes <- opt %>%
   dplyr::select(outvars)
 
@@ -431,34 +434,34 @@ write_csv(optImp, file = 'Obstetrics_Periodontal_Therapy.csv')
 
 
 
+# ------------------------------------------------------------------------
+# Balanced version for LASSO and RF
 
 
-# Check balance of factor variables for ML
-factor_counts <- optImp %>%
-  dplyr::select(where(is.factor)) %>%
-  map(~ as.data.frame(table(.))) %>%
-  imap(~ setNames(.x, c("Level", "Count")) %>% mutate(Variable = .y)) %>%
-  bind_rows() %>%
-  relocate(Variable, .before = Level)
 
+# Check balance of factor variables for ML
+factor_counts <- optImp %>%
+  dplyr::select(where(is.character)) %>%
+  pivot_longer(everything(), names_to = "Variable", values_to = "Level") %>%
+  count(Variable, Level, name = "Count")
 
 factor_counts
 
 
 
 
-
 # Smaller, more balanced version for LASSO and RF
 optML <- optImp %>%
   dplyr::select(-c(X..Vis.Elig,
+                   GA...1st.SAE,
                    Diabetes,
                    Fetal.congenital.anomaly,
                    Hypertension,
                    Traumatic.Inj,
                    BL.Bac.vag,
                    ETXU_CAT1)) %>%
-  mutate(Any.SAE. = as.factor(ifelse(Any.SAE. == 'Yes', 1, 0)),
-         Preg.ended...37.wk = as.factor(ifelse(Preg.ended...37.wk == 'Yes', 1, 0)))
+  mutate(Preg.ended...37.wk = as.factor(ifelse(Preg.ended...37.wk == 'Yes', 1, 0)))
+
 
 
 # Upsample to get more examples of rare class output
@@ -470,26 +473,27 @@ optML <- upSample(x = optML[, -which(names(optML) == "Preg.ended...37.wk")], y =
 
 optML <- optML %>%
   mutate(PID = paste0('P', 1:nrow(optML))) %>%
-  relocate(PID, .before = Clinic)
+  relocate(PID, .before = Apgar1)
 
 
 
-# Sample rows to remove 20% from the upsampled class
+# Sample rows to remove 50% from the upsampled class
 set.seed(123)
 
 nclass1 <- optML %>%
   filter(Preg.ended...37.wk == "1")
 
-down1 <- round(0.35 * nrow(nclass1))
+down1 <- round(0.5 * nrow(nclass1))
 
 PID1 <- nclass1 %>%
   slice_sample(n = down1) %>%
   pull(PID)
 
 
-# Combine with other class
+# Filter class 1
 optML <- optML %>%
-  dplyr::filter(!PID %in% PID1)
+  dplyr::filter(!PID %in% PID1) %>%
+  sample_frac(1)  # shuffle rows
 
 
 # Up-sample sparse class
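
Taken together, the balancing steps in these hunks amount to roughly the following (a sketch only; it assumes caret and dplyr are loaded, and the `y` argument of `upSample()`, truncated in the hunk header above, is the outcome factor):

```r
# Up-sample the rare class so both classes are equally frequent
optML <- caret::upSample(
  x = optML[, -which(names(optML) == "Preg.ended...37.wk")],
  y = optML$Preg.ended...37.wk,   # assumed from context
  yname = "Preg.ended...37.wk"
)

# Then drop a random 50% of the up-sampled class "1" and shuffle
set.seed(123)
PID1 <- optML %>%
  filter(Preg.ended...37.wk == "1") %>%
  slice_sample(prop = 0.5) %>%
  pull(PID)

optML <- optML %>%
  filter(!PID %in% PID1) %>%
  sample_frac(1)  # shuffle rows
```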

exercises/ExtraExercise5.qmd

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+---
+title: "Extra Exercise 5 - Models and Model Evaluation in R"
+format: html
+project:
+  type: website
+  output-dir: ../docs
+---
+
+## Extra exercises
+
+e1. Find the best single predictor in the Diabetes dataset. This is done by comparing the null model (no predictors) to all possible models with one predictor, i.e. `outcome ~ predictor1`, `outcome ~ predictor2`, etc. The null model can be formulated like so: `outcome ~ 1` (only the intercept). Fit all possible one-predictor models and compare their fit to the null model with a likelihood ratio test. Find the predictor with the lowest p-value in the likelihood ratio test. This can be done in a loop in order to avoid writing out all the models.
+
+::: {.callout-tip collapse="true"}
+## Hint
+
+To use a formula with a variable you will need to combine the literal part and the variable with paste, e.g. `paste("Outcome ~", my_pred)`.
+:::
+
+```{r}
+# Define the null model (intercept-only model)
+null_model <- glm(Diabetes ~ 1, data = train, family = binomial)
+
+# Get predictor names (excluding the outcome and ID variables)
+predictors <- setdiff(names(diabetes_nona), c("Diabetes", "ID"))
+
+# Initialize an empty data frame to store results
+results <- data.frame(Predictor = character(), ChiSq = numeric(), P_Value = numeric(), stringsAsFactors = FALSE)
+
+# Loop through each predictor and fit a logistic regression model
+for (pred in predictors) {
+
+  # Fit model with a single predictor
+  model_pred <- glm(as.formula(paste("Diabetes ~", pred)), data = train, family = binomial)
+
+  # Perform likelihood ratio test against the null model
+  test_result <- anova(null_model, model_pred, test = "Chisq")
+
+  # Extract chi-square statistic and p-value
+  # (the second row corresponds to the predictor model)
+  chi_sq <- test_result$Deviance[2]
+  p_value <- test_result$`Pr(>Chi)`[2]
+
+  # Store results
+  results <- rbind(results, data.frame(Predictor = pred, ChiSq = chi_sq, P_Value = p_value))
+}
+
+# Print the results sorted by p-value
+results <- results %>% arrange(P_Value)
+print(results)
+```
+
+e2. Write a function that handles visualization of k-means clustering results. Think about which information you need to pass and what it should return.
+
+---
+
+## Quarto
+
+Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see <https://quarto.org>.
+
+## Running Code
+
+When you click the **Render** button a document will be generated that includes both content and the output of embedded code. You can embed code like this:
+
+```{r}
+1 + 1
+```
+
+You can add options to executable code like this:
+
+```{r}
+#| echo: false
+2 * 2
+```
+
+The `echo: false` option disables the printing of code (only output is displayed).
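
For e2, one possible shape for such a function (a sketch under assumptions: factoextra is installed, `km` is a fitted `kmeans` object, and `dat` is the numeric data it was fitted on):

```r
# Plot k-means results projected onto the first two principal components.
# Returns the ggplot object so the caller can tweak, combine or save it.
plot_kmeans <- function(km, dat, title = "K-means clusters") {
  factoextra::fviz_cluster(km, data = dat, geom = "point", main = title)
}

# Usage (names are illustrative):
# km <- kmeans(scale(num_dat), centers = 4)
# plot_kmeans(km, scale(num_dat))
```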

exercises/exercise5A.qmd

Lines changed: 19 additions & 29 deletions
@@ -15,6 +15,8 @@ In this exercise you will fit and interpret simple models.
 ```{r warning=FALSE, message=FALSE}
 library(tidyverse)
 library(readxl)
+library(ggfortify)
+library(factoextra)
 ```
 
 ## Part 1: Linear regression
@@ -66,7 +68,7 @@ plot(model)
 
 9. Now, use our test set to predict the response `medv` (`median value per house in 1000s`).
 
-10. Evaluate how well our model performs. There are different ways of doing this but lets use the classic measure of RMSE (Root Mean Square Error). The psedocode below shows how to calculate the RMSE. A small RMSE (close to zero), indicates a good model.
+10. Evaluate how well our model performs. There are different ways of doing this, but let's use the classic measure of RMSE (Root Mean Square Error). The pseudo-code below shows how to calculate the RMSE. A small RMSE (close to zero) indicates a good model.
 
 ```{r, eval=FALSE}
 #RMSE
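
The hunk is truncated just after `#RMSE`, so the exercise's own pseudo-code is not visible here; for reference, the standard calculation it refers to is simply (object names assumed from the surrounding text):

```r
# Root Mean Square Error between observed and predicted test values
rmse <- sqrt(mean((y_test - y_pred)^2))
```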
@@ -83,62 +85,50 @@ Plot `y_test` against `y_pred`.
 
 ## Part 2: Logistic regression
 
-For this part we will use the joined diabetes, so lets load the joined dataset we created in exercise 1, e.g. 'diabetes_join.xlsx' or what you have named it.
+For this part we will use the joined diabetes data, so let's load the joined dataset we created in exercise 1, e.g. `diabetes_join.xlsx` or whatever you have named it.
 
 As the outcome we are studying, `Diabetes`, is a categorical variable, we will perform logistic regression. We select serum calcium levels (`Serum_ca2`), `BMI` and smoking habits (`Smoker`) as predictive variables.
 
-12. Logistic regression does not allow for any missing values so first ensure you do not have NAs in your dataframe. Ensure that your outcome variable `Diabetes` is a factor.
+12. Read in the Diabetes dataset.
 
-13. Split your data into training and test data. Take care that the two classes of the outcome variable are represented in both training and test data, and at similar ratios.
+13. Logistic regression does not allow for any missing values, so first ensure you do not have NAs in your dataframe. Ensure that your outcome variable `Diabetes` is a factor.
 
-14. Fit a logistic regression model with `Serum_ca2`, `BMI` and `Smoker` as predictors and `Diabetes` as outcome, using your training data.
+14. Split your data into training and test data. Take care that the two classes of the outcome variable are represented in both training and test data, and at similar ratios.
+
+15. Fit a logistic regression model with `Serum_ca2`, `BMI` and `Smoker` as predictors and `Diabetes` as outcome, using your training data.
 
 ::: {.callout-tip collapse="true"}
 ## Hint
 
 glm(..., family = 'binomial')
 :::
 
-15. Check the model summary and try to determine whether you could potentially drop one of your variables? If so, remake your model and check the coefficients, and error terms again.
+16. Check the model summary and try to determine whether you could potentially drop one or more of your variables. If so, make this alternative model (model2) and compare it to the original model. Is there a significant loss/gain, i.e. a better fit, when including the serum calcium levels as a predictor?
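
For the comparison in item 16, a likelihood ratio test is one option (a sketch; it assumes the training split is called `train` and that the reduced model drops `Serum_ca2`):

```r
# Fit the full and the reduced model, then compare with a LRT;
# a large p-value suggests Serum_ca2 adds little to the fit
model  <- glm(Diabetes ~ Serum_ca2 + BMI + Smoker, data = train, family = binomial)
model2 <- glm(Diabetes ~ BMI + Smoker, data = train, family = binomial)
anova(model2, model, test = "Chisq")
```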
 
-16. Now, use your model to predict Diabetes class based on your test set. What does the output of the prediction mean?
+17. Now, use your model to predict Diabetes class based on your test set. What does the output of the prediction mean?
 
 ::: {.callout-tip collapse="true"}
 ## Hint
 
 `predict(... , type ='response')`
 :::
 
-17. Lets evaluate the performance of our model. As we are performing classification, measures such as mse/rmse will not work, instead we will calculate the Accuracy. In order to get the Accuracy you must first convert our predictions into Diabetes class (e.g. 0 or 1).
+18. Let's evaluate the performance of our model. As we are performing classification, measures such as MSE/RMSE will not work; instead we will calculate the accuracy. In order to get the accuracy you must first convert the predictions into Diabetes class labels (e.g. 0 or 1).
 
 ```{r, eval=FALSE}
-confusionMatrix(y_pred, y_test)
+caret::confusionMatrix(y_pred, y_test)
 ```
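
Converting the predicted probabilities (from `predict(..., type = 'response')`) into class labels first can look like this (a sketch; the 0.5 cutoff and the object names are assumptions):

```r
# Turn probabilities into 0/1 labels, matching the factor levels of y_test
y_pred <- factor(ifelse(pred_probs > 0.5, 1, 0), levels = levels(y_test))
caret::confusionMatrix(y_pred, y_test)
```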
 
 ## Part 3: Clustering
 
-In this part we will run clustering on the joined diabetes dataset from exercise 1. Load it here if you don't have it already from Part 2.
-
-14. Run the k-means clustering algorithm with 4 centers on the data. Consider which columns you can use and if you have to manipulate them before. If you get an error, check whether you have values that might not be admissible, such as NA.
-
-15. Check whether the data you have run k-means on has the same number of rows as the dataframe with meta information, e.g. whether the person had diabetes. If they are not aligned, create a dataframe with Diabetes info that matches the dataframe you ran clustering on.
-
-16. Visualize the results of your clustering.
-
-17. Investigate the best number of clusters.
+In this part we will run clustering on the joined diabetes dataset (`diabetes_join.xlsx`) from exercise 1. Load it here if you don't have it already from Part 2 above.
 
-18. Re-do the clustering (plus visualization) with that number.
+19. Before running k-means clustering, remove any missing values across all variables in your dataset.
 
-------------------------------------------------------------------------
+20. Run the k-means clustering algorithm with 4 centers on the data. Consider which columns you can use and whether you have to do anything to them before clustering.
 
-## Extra exercises
+21. Visualize the results of your clustering.
 
-e1. Find the best single predictor in the Diabetes dataset. This is done by comparing the null model (no predictors) to all possible models with one predictor, i.e. `outcome ~ predictor`, `outcome ~ predictor2`, ect. The null model can be formulated like so: `outcome ~ 1` (only the intercept). Fit all possible one predictor models and compare their fit to the null model with a likelihood ratio test. Find the predictor with the lowest p-value in the likelihood ratio test. This can be done in a loop in order to avoid writing out all models.
-
-::: {.callout-tip collapse="true"}
-## Hint
-
-To use a formula with a variable you will need to combine the literal part and the variable with paste, e.g. `paste("Outcome ~", my_pred)`.
-:::
+22. Investigate the best number of clusters.
 
-e2. Write a function that handles visualization of k-means clustering results. Think about which information you need to pass and what it should return.
+23. Re-do the clustering (plus visualization) with that number.
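
Items 19-23 together amount to a flow like this (a sketch; the dataset name is an assumption, and tidyverse/factoextra are loaded at the top of the exercise):

```r
# Keep complete rows, numeric columns only, and standardise them
num_dat <- diabetes %>%
  drop_na() %>%
  select(where(is.numeric)) %>%
  scale()

set.seed(123)
km <- kmeans(num_dat, centers = 4)                 # item 20

fviz_cluster(km, data = num_dat, geom = "point")   # item 21
fviz_nbclust(num_dat, kmeans, method = "wss")      # item 22
```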

exercises/exercise5B.qmd

Lines changed: 82 additions & 3 deletions
@@ -52,7 +52,7 @@ The remaining 28 variables we will consider as potential explanatory variables f
 
 Elastic Net regression is part of the family of penalized regressions, which also includes Ridge regression and LASSO regression. Penalized regressions are especially useful when dealing with many predictors, as they help eliminate less informative ones while retaining the important predictors, making them ideal for high-dimensional datasets. One of the key advantages of Elastic Net over other types of penalized regression is its ability to handle multicollinearity and situations where the number of predictors exceeds the number of observations.
 
-As described above we have five variables which could be considered outcomes as these where all measured at the end of pregnancy. We can only work with one outcome at a time and we will pick `Preg.ended...37.wk`. This variable is a factor variable which denotes if a women gave birth prematurely (1=yes, 0=no).
+As described above we have five variables which could be considered outcomes, as these were all measured at the end of pregnancy. We can only work with one outcome at a time and we will pick `Preg.ended...37.wk` for now. This variable is a factor variable which denotes whether a woman gave birth prematurely (1 = yes, 0 = no).
 
 5. As you will use the response `Preg.ended...37.wk`, you should remove the other five outcome measures from your dataset.
 
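Elastic Net itself is typically fitted with glmnet, where `alpha` mixes the two penalties (a sketch, not the exercise's own code; `x_train` as a numeric matrix and `y_train` are assumptions):

```r
library(glmnet)

# alpha = 0 is Ridge, alpha = 1 is LASSO, values in between are Elastic Net
enet_fit <- cv.glmnet(x = x_train, y = y_train,
                      family = "binomial", alpha = 0.5)

plot(enet_fit)                    # CV curve across lambda
coef(enet_fit, s = "lambda.min")  # coefficients at the best lambda
```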
@@ -99,7 +99,12 @@ Now, lets see how well your model performed.
 
 14. Predict if an individual is likely to give birth before the 37th week using your model and your test set. See pseudo-code below.
 
-15. Just like for the logistic regression model you can calculate the accuracy of the prediction by first converting the predicted probabilities back into class labels (0, 1) and then comparing these to `y_test` with `confusionMatrix()`. Do you have a good accuracy? N.B look at the 2x2 contingency table, what does it tell you?
+```{r, eval = FALSE}
+y_pred <- predict(model, test, type = 'class')
+```
+
+15. Just like for the logistic regression model you can calculate the accuracy of the prediction by comparing it to `y_test` with `confusionMatrix()`. Do you have a good accuracy? N.B. look at the 2x2 contingency table: what does it tell you?
 
 16. Lastly, let's extract the variables which were retained in the model (i.e. not penalized out). We do this by extracting the coefficients with `coef()` from our model. See pseudo-code below.
 
@@ -113,4 +118,78 @@ coeffsDat <- as.data.frame(as.matrix(coeffs))
 
 17. Make a plot that shows the absolute importance of the variables retained in your model. This could be a barplot with variable names on the x-axis and the height of the bars denoting the absolute size of each coefficient.
 
-17. Make a logistic regression using this same dataset (you already have your train data, test data, y_train and y_test). Do you get similar results?
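
The barplot of absolute coefficients asked for above can be sketched as follows (assuming tidyverse is loaded and `coeffsDat` comes from the pseudo-code in the hunk header, with variable names as row names and the coefficients in the first column):

```r
coeffsDat %>%
  rownames_to_column("Variable") %>%
  rename(Coef = 2) %>%                              # the single coefficient column
  filter(Variable != "(Intercept)", Coef != 0) %>%  # keep retained predictors only
  ggplot(aes(x = reorder(Variable, -abs(Coef)), y = abs(Coef))) +
  geom_col() +
  labs(x = NULL, y = "Absolute coefficient") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```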
+## Part 2: Random Forest
+
+Now, we will make a Random Forest.
+
+We will continue using `Obt_Perio_ML.Rdata` with `Preg.ended...37.wk` as outcome.
+
+18. Just like in the Elastic Net (EN) section above:
+
+- Load the dataset (if you have not already).
+
+- Remove the outcome variables you will not be using.
+
+- Split the dataset into test and train set - this time keep the outcome variable `Preg.ended...37.wk` in the dataset.
+
+- Remember to remove the `PID` column before training!
+
+19. Set up a Random Forest model with cross-validation. See pseudo-code below. Remember to set a seed.
+
+First the cross-validation parameters:
+
+```{r, eval=FALSE}
+set.seed(123)
+
+# Set up cross-validation: 5-fold CV
+RFcv <- trainControl(
+  method = "cv",
+  number = 5,
+  classProbs = TRUE,
+  summaryFunction = twoClassSummary,
+  savePredictions = "final"
+)
+```
+
+Next we train the model:
+
+```{r, eval=FALSE}
+# Train Random Forest
+set.seed(123)
+rf_model <- train(
+  Outcome ~ .,
+  data = Trainingdata,
+  method = "rf",
+  trControl = RFcv,
+  metric = "ROC",
+  tuneLength = 5
+)
+
+# Model summary
+print(rf_model)
+```
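
One caret detail worth flagging (an assumption about your data, since `Preg.ended...37.wk` was coded 0/1 earlier): with `classProbs = TRUE`, `train()` requires factor levels that are valid R variable names, so levels like "0"/"1" need recoding first, e.g.:

```r
# caret errors on numeric-looking class levels when classProbs = TRUE,
# so relabel 0/1 as No/Yes before calling train()
Trainingdata$Preg.ended...37.wk <- factor(
  Trainingdata$Preg.ended...37.wk,
  levels = c("0", "1"),
  labels = c("No", "Yes")
)
```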
+
+20. Plot your model fit. How does your model improve when you add 10, 20, 30, etc. predictors?
+
+```{r, eval=FALSE}
+# Best parameters
+rf_model$bestTune
+
+# Plot performance
+plot(rf_model)
+```
+
+21. Use your test set to evaluate your model performance. How does the random forest compare to the elastic net regression?
+
+22. Extract the predictive variables with the greatest importance from your fit.
+
+```{r, eval=FALSE}
+varImpOut <- varImp(rf_model)
+
+varImpOut$importance
+```
+
+23. Make a logistic regression using the same dataset (you already have your train data, test data, y_train and y_test). How do the results of the Elastic Net regression and the Random Forest compare to the output of your glm?
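
For items 21 and 23, the evaluation flow might look like this (a sketch; `Testdata` and the No/Yes labels follow the assumptions above, and `PID` has already been removed per item 18):

```r
# Item 21: predicted classes from the random forest on the test set
rf_pred <- predict(rf_model, newdata = Testdata)
caret::confusionMatrix(rf_pred, Testdata$Preg.ended...37.wk)

# Item 23: a plain logistic regression on the same split for comparison
glm_fit  <- glm(Preg.ended...37.wk ~ ., data = Trainingdata, family = binomial)
glm_prob <- predict(glm_fit, newdata = Testdata, type = "response")
glm_pred <- factor(ifelse(glm_prob > 0.5, "Yes", "No"),
                   levels = levels(Testdata$Preg.ended...37.wk))
caret::confusionMatrix(glm_pred, Testdata$Preg.ended...37.wk)
```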
