Code of Analysis and Research
Question 1 Code:
Load necessary libraries
library(tidyverse) library(readr) library(caret) library(randomForest)
Read and preprocess the data
data <- read_csv(“hcvdat.csv”) %>% mutate( AgeGroup = cut(Age, breaks = c(0, 30, 50, Inf), labels = c(“Young”, “Middle-aged”, “Senior”)), Sex = factor(Sex, levels = c(“m”, “f”)), ALB_log = log(ALB + 1), Age2 = Age^2 ) %>% na.omit()
Split the data into training and validation sets
set.seed(123) train_indices <- sample(1:nrow(data), 0.8 * nrow(data)) train_data <- data[train_indices, ] validation_data <- data[-train_indices, ]
Train a random forest model
fitControl <- trainControl(method = “cv”, number = 10, savePredictions = “final”) rf_model <- train( ALB_log ~ Sex + Age + ALB + ALP + ALT + AST + BIL + CHE + CHOL + CREA + GGT + PROT + Age2, data = train_data, method = “rf”, ntree = 500, trControl = fitControl, importance = TRUE )
Print the model summary
print(rf_model) varImpPlot(rf_model$finalModel, main = “Variable Importance”)
Make predictions and calculate performance metrics
predictions <- predict(rf_model, newdata = validation_data) actuals <- validation_data$ALB_log residuals <- actuals - predictions RMSE <- sqrt(mean(residuals^2)) R2 <- cor(actuals, predictions)^2
Print RMSE and R-squared
print(paste(“RMSE:”, RMSE)) print(paste(“R-squared:”, R2))
Plot residuals vs fitted values
plot_data <- data.frame(Fitted = predictions, Residuals = residuals) ggplot(plot_data, aes(x = Fitted, y = Residuals)) + geom_point() + geom_smooth(method = “lm”, se = TRUE, color = “blue”) + labs(title = “Residuals vs Fitted Values”, x = “Fitted Values”, y = “Residuals”)
Additional data manipulation and plotting
data <- data %>% mutate(AgeGroup = cut(Age, breaks = c(0, 30, 50, 70), labels = c(“0-30”, “31-50”, “51-70”)))
age_group_means <- data %>% group_by(AgeGroup) %>% summarise( Mean_ALP = mean(ALP, na.rm = TRUE), Mean_ALT = mean(ALT, na.rm = TRUE), Mean_AST = mean(AST, na.rm = TRUE) )
age_group_means %>% gather(key = “Enzyme”, value = “Mean_Level”, -AgeGroup) %>% ggplot(aes(x = AgeGroup, y = Mean_Level, fill = Enzyme)) + geom_bar(stat = “identity”, position = position_dodge()) + labs(title = “Average Liver Enzyme Levels by Age Group”, x = “Age Group”, y = “Average Enzyme Level”) + theme_minimal()
Question 2 Code:
Load necessary libraries
library(tidyverse) library(car)
Read and preprocess the data
data_viable <- read_csv(“hcvdat.csv”) %>% mutate( Category = factor(Category, levels = c(“0=Blood Donor”, “1=Hepatitis”, “2=Fibrosis”, “3=Cirrhosis”)), Sex = factor(Sex, levels = c(“m”, “f”)), AgeGroup = cut(Age, breaks = c(0, 30, 50, 75, Inf), labels = c(“Young”, “Middle-aged”, “Senior”, “Elderly”)) )
Perform Levene’s tests for homogeneity of variances
levene_test_category <- leveneTest(GGT ~ Category, data = data_viable) levene_test_sex <- leveneTest(GGT ~ Sex, data = data_viable) levene_test_agegroup <- leveneTest(GGT ~ AgeGroup, data = data_viable) levene_test_cat_sex <- leveneTest(GGT ~ Category:Sex, data = data_viable)
Print the results of the Levene’s tests
print(levene_test_category) print(levene_test_sex) print(levene_test_agegroup) print(levene_test_cat_sex)
Log-transform GGT and add to the dataset
data_viable <- data_viable %>% mutate(log_GGT = log(GGT + 1))
Perform ANOVA on log-transformed GGT with interaction terms
anova_log_ggt <- aov(log_GGT ~ Category * Sex * AgeGroup, data = data_viable) print(summary(anova_log_ggt))
Fit a simple linear model
simple_model <- lm(log_GGT ~ Category + Sex + AgeGroup, data = data_viable) print(summary(simple_model))
Create a plot showing interactions
p <- ggplot(data_viable, aes(x = Category, y = log_GGT, color = Sex, shape = AgeGroup)) + geom_point(alpha = 0.6) + geom_smooth(method = “lm”, se = FALSE, fullrange = TRUE) + facet_wrap(~AgeGroup) + labs( title = “Interaction of Category, Sex, and Age on log(GGT) Levels”, y = “Log of Gamma-Glutamyl Transferase”, x = “Category” ) + scale_color_brewer(palette = “Set1”) + theme_minimal()
Print the plot
print(p)
Question 3 Code:
Load necessary libraries
library(tidyverse) library(caret) library(e1071) library(nnet) library(pROC) library(randomForest)
Read and preprocess the data
data <- read_csv(“hcvdat.csv”) %>% mutate( Category = factor(Category, levels = c(“0=Blood Donor”, “1=Hepatitis”, “2=Fibrosis”, “3=Cirrhosis”), labels = c(“Blood_Donor”, “Hepatitis”, “Fibrosis”, “Cirrhosis”)), Sex = factor(Sex, levels = c(“m”, “f”)), log_ALB = log(ALB + 1), log_BIL = log(BIL + 1) ) %>% na.omit()
Split the data into training and test sets
set.seed(123) training_indices <- createDataPartition(data$Category, p = 0.8, list = FALSE) training_data <- data[training_indices, ] test_data <- data[-training_indices, ]
Set up cross-validation
fitControl <- trainControl(method = “cv”, number = 10, summaryFunction = multiClassSummary, classProbs = TRUE, savePredictions = “final”)
Train a Random Forest model
model_rf <- train(Category ~ log_ALB + log_BIL + AST + ALT + GGT + Sex, data = training_data, method = “rf”, trControl = fitControl, metric = “Accuracy”)
Train an SVM model
model_svm <- train(Category ~ log_ALB + log_BIL + AST + ALT + GGT + Sex, data = training_data, method = “svmRadial”, trControl = fitControl, metric = “Accuracy”)
Make predictions with Random Forest
predictions_rf <- predict(model_rf, newdata = test_data) confusionMatrix_rf <- confusionMatrix(predictions_rf, test_data$Category)
Calculate ROC curves and AUCs for Random Forest
probabilities_rf <- predict(model_rf, newdata = test_data, type = “prob”) roc_results_rf <- lapply(levels(test_data\(Category), function(class) roc(response = test_data\)Category, predictor = probabilities_rf[, class])) aucs_rf <- sapply(roc_results_rf, auc)
Make predictions with SVM
probabilities_svm <- predict(model_svm, newdata = test_data, type = “prob”) roc_results_svm <- lapply(levels(test_data\(Category), function(class) roc(response = test_data\)Category, predictor = probabilities_svm[, class])) aucs_svm <- sapply(roc_results_svm, auc)
Print AUCs
print(list(RandomForest_AUCs = aucs_rf, SVM_AUCs = aucs_svm))
Confusion matrix heatmap
confusionMatrix_rf <- confusionMatrix(predictions_rf, test_data\(Category) heatmap(as.matrix(confusionMatrix_rf\)table), Rowv = NA, Colv = NA, scale = “column”, margins = c(5,5), xlab = “Predicted”, ylab = “Actual”, main = “Confusion Matrix Heatmap”)
Plot ROC curves
plot(roc_results_rf[[1]], col = “red”, main = “ROC Curves for Disease Categories”) for (i in 2:length(roc_results_rf)) { plot(roc_results_rf[[i]], col = i, add = TRUE) } legend(“bottomright”, legend = levels(test_data$Category), col = 1:length(roc_results_rf), lwd = 2)
Variable Importance Plot for Random Forest
varImpPlot(model_rf$finalModel, main = “Variable Importance in Predicting Liver Disease Severity”)