Code of Analysis and Research

Question 1 Code:

Load necessary libraries

library(tidyverse) library(readr) library(caret) library(randomForest)

Read and preprocess the data

data <- read_csv(“hcvdat.csv”) %>% mutate( AgeGroup = cut(Age, breaks = c(0, 30, 50, Inf), labels = c(“Young”, “Middle-aged”, “Senior”)), Sex = factor(Sex, levels = c(“m”, “f”)), ALB_log = log(ALB + 1), Age2 = Age^2 ) %>% na.omit()

Split the data into training and validation sets

set.seed(123) train_indices <- sample(1:nrow(data), 0.8 * nrow(data)) train_data <- data[train_indices, ] validation_data <- data[-train_indices, ]

Train a random forest model

fitControl <- trainControl(method = “cv”, number = 10, savePredictions = “final”) rf_model <- train( ALB_log ~ Sex + Age + ALB + ALP + ALT + AST + BIL + CHE + CHOL + CREA + GGT + PROT + Age2, data = train_data, method = “rf”, ntree = 500, trControl = fitControl, importance = TRUE )

Print the model summary

print(rf_model) varImpPlot(rf_model$finalModel, main = “Variable Importance”)

Make predictions and calculate performance metrics

predictions <- predict(rf_model, newdata = validation_data) actuals <- validation_data$ALB_log residuals <- actuals - predictions RMSE <- sqrt(mean(residuals^2)) R2 <- cor(actuals, predictions)^2

Print RMSE and R-squared

print(paste(“RMSE:”, RMSE)) print(paste(“R-squared:”, R2))

Plot residuals vs fitted values

plot_data <- data.frame(Fitted = predictions, Residuals = residuals) ggplot(plot_data, aes(x = Fitted, y = Residuals)) + geom_point() + geom_smooth(method = “lm”, se = TRUE, color = “blue”) + labs(title = “Residuals vs Fitted Values”, x = “Fitted Values”, y = “Residuals”)

Additional data manipulation and plotting

data <- data %>% mutate(AgeGroup = cut(Age, breaks = c(0, 30, 50, 70), labels = c(“0-30”, “31-50”, “51-70”)))

age_group_means <- data %>% group_by(AgeGroup) %>% summarise( Mean_ALP = mean(ALP, na.rm = TRUE), Mean_ALT = mean(ALT, na.rm = TRUE), Mean_AST = mean(AST, na.rm = TRUE) )

age_group_means %>% gather(key = “Enzyme”, value = “Mean_Level”, -AgeGroup) %>% ggplot(aes(x = AgeGroup, y = Mean_Level, fill = Enzyme)) + geom_bar(stat = “identity”, position = position_dodge()) + labs(title = “Average Liver Enzyme Levels by Age Group”, x = “Age Group”, y = “Average Enzyme Level”) + theme_minimal()

Question 2 Code:

Load necessary libraries

library(tidyverse) library(car)

Read and preprocess the data

data_viable <- read_csv(“hcvdat.csv”) %>% mutate( Category = factor(Category, levels = c(“0=Blood Donor”, “1=Hepatitis”, “2=Fibrosis”, “3=Cirrhosis”)), Sex = factor(Sex, levels = c(“m”, “f”)), AgeGroup = cut(Age, breaks = c(0, 30, 50, 75, Inf), labels = c(“Young”, “Middle-aged”, “Senior”, “Elderly”)) )

Perform Levene’s tests for homogeneity of variances

levene_test_category <- leveneTest(GGT ~ Category, data = data_viable) levene_test_sex <- leveneTest(GGT ~ Sex, data = data_viable) levene_test_agegroup <- leveneTest(GGT ~ AgeGroup, data = data_viable) levene_test_cat_sex <- leveneTest(GGT ~ Category:Sex, data = data_viable)

Print the results of the Levene’s tests

print(levene_test_category) print(levene_test_sex) print(levene_test_agegroup) print(levene_test_cat_sex)

Log-transform GGT and add to the dataset

data_viable <- data_viable %>% mutate(log_GGT = log(GGT + 1))

Perform ANOVA on log-transformed GGT with interaction terms

anova_log_ggt <- aov(log_GGT ~ Category * Sex * AgeGroup, data = data_viable) print(summary(anova_log_ggt))

Fit a simple linear model

simple_model <- lm(log_GGT ~ Category + Sex + AgeGroup, data = data_viable) print(summary(simple_model))

Create a plot showing interactions

p <- ggplot(data_viable, aes(x = Category, y = log_GGT, color = Sex, shape = AgeGroup)) + geom_point(alpha = 0.6) + geom_smooth(method = “lm”, se = FALSE, fullrange = TRUE) + facet_wrap(~AgeGroup) + labs( title = “Interaction of Category, Sex, and Age on log(GGT) Levels”, y = “Log of Gamma-Glutamyl Transferase”, x = “Category” ) + scale_color_brewer(palette = “Set1”) + theme_minimal()

Print the plot

print(p)

Question 3 Code:

Load necessary libraries

library(tidyverse) library(caret) library(e1071) library(nnet) library(pROC) library(randomForest)

Read and preprocess the data

data <- read_csv(“hcvdat.csv”) %>% mutate( Category = factor(Category, levels = c(“0=Blood Donor”, “1=Hepatitis”, “2=Fibrosis”, “3=Cirrhosis”), labels = c(“Blood_Donor”, “Hepatitis”, “Fibrosis”, “Cirrhosis”)), Sex = factor(Sex, levels = c(“m”, “f”)), log_ALB = log(ALB + 1), log_BIL = log(BIL + 1) ) %>% na.omit()

Split the data into training and test sets

set.seed(123) training_indices <- createDataPartition(data$Category, p = 0.8, list = FALSE) training_data <- data[training_indices, ] test_data <- data[-training_indices, ]

Set up cross-validation

fitControl <- trainControl(method = “cv”, number = 10, summaryFunction = multiClassSummary, classProbs = TRUE, savePredictions = “final”)

Train a Random Forest model

model_rf <- train(Category ~ log_ALB + log_BIL + AST + ALT + GGT + Sex, data = training_data, method = “rf”, trControl = fitControl, metric = “Accuracy”)

Train an SVM model

model_svm <- train(Category ~ log_ALB + log_BIL + AST + ALT + GGT + Sex, data = training_data, method = “svmRadial”, trControl = fitControl, metric = “Accuracy”)

Make predictions with Random Forest

predictions_rf <- predict(model_rf, newdata = test_data) confusionMatrix_rf <- confusionMatrix(predictions_rf, test_data$Category)

Calculate ROC curves and AUCs for Random Forest

probabilities_rf <- predict(model_rf, newdata = test_data, type = “prob”) roc_results_rf <- lapply(levels(test_data$Category), function(class) roc(response = test_data$Category, predictor = probabilities_rf[, class])) aucs_rf <- sapply(roc_results_rf, auc)

Make predictions with SVM

probabilities_svm <- predict(model_svm, newdata = test_data, type = “prob”) roc_results_svm <- lapply(levels(test_data$Category), function(class) roc(response = test_data$Category, predictor = probabilities_svm[, class])) aucs_svm <- sapply(roc_results_svm, auc)

Print AUCs

print(list(RandomForest_AUCs = aucs_rf, SVM_AUCs = aucs_svm))

Confusion matrix heatmap

confusionMatrix_rf <- confusionMatrix(predictions_rf, test_data$Category) heatmap(as.matrix(confusionMatrix_rf$table), Rowv = NA, Colv = NA, scale = “column”, margins = c(5,5), xlab = “Predicted”, ylab = “Actual”, main = “Confusion Matrix Heatmap”)

Plot ROC curves

plot(roc_results_rf[[1]], col = “red”, main = “ROC Curves for Disease Categories”) for (i in 2:length(roc_results_rf)) { plot(roc_results_rf[[i]], col = i, add = TRUE) } legend(“bottomright”, legend = levels(test_data$Category), col = 1:length(roc_results_rf), lwd = 2)

Variable Importance Plot for Random Forest

varImpPlot(model_rf$finalModel, main = “Variable Importance in Predicting Liver Disease Severity”)