# 2-DB_Backup_Failure.r
# -----------------------------------------------------------------------------------
# File Name    : 2-DB_Backup_Failure.r
# Author       : Pierre Montbleau
# Description  : Database Backup Failure Analysis and Predictive Modeling Script
# Purpose      : Analyze backup success/failure patterns, build predictive models,
#                and generate comprehensive reports with recommendations
# Call Syntax  : source("F:\\DBA\\Scripts\\R\\2-DB_Backup_Failure.r")
# Parameters   : None - all parameters configured internally (output format auto-detected)
# Last Modified: 2024-01-15
# -----------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------
# Load required packages, installing any that are missing.
# FIX: use requireNamespace() for the availability check instead of require()
# (require() attaches as a side effect and returning FALSE to signal a missing
# package is its only legitimate use; requireNamespace() is the idiomatic probe).
# -----------------------------------------------------------------------------------
required_packages <- c(
  "DBI", "odbc", "dplyr", "lubridate", "ggplot2", "caret", "xgboost",
  "tidyr", "isotree", "rmarkdown", "knitr", "gridExtra"
)

invisible(lapply(required_packages, function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, quiet = TRUE)
  }
  # Attach the package for the rest of the script (dplyr verbs, ggplot, etc.).
  library(pkg, character.only = TRUE)
}))
# Install tinytex for PDF generation if not available:
#   install.packages('tinytex')
#   tinytex::install_tinytex()
# -----------------------------------------------------------------------------------
# Determine the report output format based on Pandoc availability.
#
# Returns "pdf" when Pandoc is on the PATH, otherwise "html".
#
# FIX: the original called rmarkdown::install_pandoc(), which is not a function
# exported by the rmarkdown package; the call always errored inside tryCatch and
# the function fell back to HTML anyway. The dead install attempt has been
# removed — the observable result (pdf when Pandoc present, html otherwise) is
# unchanged. To enable PDF output, install Pandoc manually (it ships with
# RStudio, or see pandoc.org).
# -----------------------------------------------------------------------------------
check_pandoc <- function() {
  if (rmarkdown::pandoc_available()) {
    return("pdf")
  }
  message("Pandoc not found - PDF generation unavailable, falling back to HTML")
  "html"
}
# -----------------------------------------------------------------------------------
# Build a unique, timestamped report path under the fixed reports directory.
#
# output_format: the file extension to use ("pdf" or "html").
# Returns the full path as a character scalar; creates the directory tree on
# first use.
# -----------------------------------------------------------------------------------
generate_report_filename <- function(output_format) {
  report_dir <- "F:\\DBA\\Scripts\\R\\Reports"

  # Make sure the destination exists before composing the path.
  if (!dir.exists(report_dir)) {
    dir.create(report_dir, recursive = TRUE)
  }

  stamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
  file.path(report_dir, sprintf("Backup_Analysis_Report_%s.%s", stamp, output_format))
}
# -----------------------------------------------------------------------------------
# Open and validate an ODBC database connection.
#
# Returns a live DBI connection or stops with a descriptive error.
#
# FIX: the original read `pwd = "[PASSWORD]") timeout = 10)` — a stray closing
# parenthesis that made the file unparseable. `timeout` is now passed as a
# proper argument to dbConnect().
# NOTE(review): "[DB_NAME]"/"[USERNAME]"/"[PASSWORD]" look like placeholders to
# be substituted per environment — confirm before deploying.
# -----------------------------------------------------------------------------------
connect_to_database <- function() {
  tryCatch({
    conn <- dbConnect(
      odbc::odbc(),
      dsn = "[DB_NAME]",
      uid = "[USERNAME]",
      pwd = "[PASSWORD]",
      timeout = 10
    )
    message("Successfully connected to database")

    # Verify the connection actually works with a trivial round-trip
    # (DUAL suggests an Oracle backend).
    test_result <- dbGetQuery(conn, "SELECT 1 FROM DUAL")
    if (nrow(test_result) != 1) {
      stop("Test query failed - connection validation unsuccessful")
    }
    conn
  }, error = function(e) {
    stop("Database connection failed: ", e$message)
  })
}
# -----------------------------------------------------------------------------------
# Run a query and return its rows, degrading gracefully:
# - empty result  -> warning + zero-row frame with the query's column layout
# - query error   -> warning + NULL
#
# FIX: the original called dbSendQuery() to probe column metadata but never
# called dbClearResult(), leaking a pending result set on the connection.
# FIX: the zero-row fallback frame now carries the real column names (the
# stated intent was "correct structure"; matrix() alone produced X1..Xn).
# -----------------------------------------------------------------------------------
query_data <- function(conn, query) {
  tryCatch({
    result <- dbGetQuery(conn, query)
    if (nrow(result) == 0) {
      warning("Query returned empty result: ", substr(query, 1, 30), "...")
      # Recover the column layout so callers still get a correctly-shaped frame.
      col_info <- tryCatch({
        res <- dbSendQuery(conn, query)
        info <- dbColumnInfo(res)
        dbClearResult(res)  # release the pending result set
        info
      }, error = function(e) NULL)
      if (is.null(col_info)) {
        return(data.frame())
      }
      empty <- data.frame(matrix(ncol = nrow(col_info), nrow = 0))
      names(empty) <- col_info$name
      return(empty)
    }
    message("Successfully queried ", nrow(result), " rows from: ",
            substr(query, 1, 30), "...")
    result
  }, error = function(e) {
    warning("Query '", substr(query, 1, 30), "...' failed: ", e$message)
    NULL
  })
}
# -----------------------------------------------------------------------------------
# Clean and enrich raw backup records.
#
# - Parses COMPLETION_TIME / CHECK_DATE, derives BACKUP_SUCCESS from keyword
#   matches on BACKUP_TYPE, and computes hour/day/duration features.
# - When no explicit failures exist, flags duration outliers (|z| > 3) as
#   suspected failures so downstream modeling still has a signal.
# Stops on NULL/empty input, missing required columns, or when nothing
# survives filtering.
#
# FIX: the original wrapped lubridate::hour()/wday() in ifelse(!is.na(...)),
# which is redundant (both propagate NA on their own) and — worse — ifelse()
# strips attributes, silently collapsing the labelled day-of-week factor to
# bare integer codes. The direct vectorized calls preserve the labels.
# -----------------------------------------------------------------------------------
preprocess_backup_data <- function(backup_data) {
  if (is.null(backup_data)) {
    stop("Backup data is NULL - check your database connection and query")
  }
  if (nrow(backup_data) == 0) {
    stop("Backup data is empty - no records returned from query")
  }

  required_cols <- c("DB_NAME", "BACKUP_TYPE", "COMPLETION_TIME", "CHECK_DATE")
  missing_cols <- setdiff(required_cols, colnames(backup_data))
  if (length(missing_cols) > 0) {
    stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
  }

  # Status keywords used to classify BACKUP_TYPE free text.
  failure_patterns <- c("FAILED", "ERROR", "ABORTED", "STOPPED",
                        "UNSUCCESSFUL", "PARTIAL", "TIMEOUT")
  success_patterns <- c("COMPLETED", "SUCCESS", "OK", "DONE", "FINISHED")
  failure_regex <- paste(failure_patterns, collapse = "|")
  success_regex <- paste(success_patterns, collapse = "|")

  processed_data <- backup_data %>%
    mutate(
      COMPLETION_TIME = tryCatch(
        as.POSIXct(COMPLETION_TIME, format = "%Y-%m-%d %H:%M:%S"),
        error = function(e) {
          warning("Failed to parse COMPLETION_TIME: ", e$message)
          NA
        }
      ),
      CHECK_DATE = tryCatch(
        as.Date(CHECK_DATE),
        error = function(e) {
          warning("Failed to parse CHECK_DATE: ", e$message)
          NA
        }
      ),
      BACKUP_SUCCESS = case_when(
        grepl(failure_regex, BACKUP_TYPE, ignore.case = TRUE) ~ 0,
        grepl(success_regex, BACKUP_TYPE, ignore.case = TRUE) ~ 1,
        TRUE ~ 1  # default to success if not explicitly a failure
      ),
      BACKUP_HOUR = lubridate::hour(COMPLETION_TIME),
      BACKUP_DAY = lubridate::wday(COMPLETION_TIME, label = TRUE),
      # NOTE(review): duration assumes CHECK_DATE marks the backup start —
      # confirm against the BACKUP_STATUS schema; NA in either input yields NA.
      BACKUP_DURATION = as.numeric(
        difftime(COMPLETION_TIME, as.POSIXct(CHECK_DATE), units = "mins")
      )
    )

  processed_data <- processed_data %>%
    filter(!is.na(COMPLETION_TIME), !is.na(BACKUP_SUCCESS)) %>%
    distinct()

  if (nrow(processed_data) == 0) {
    stop("After preprocessing, no valid records remain - check your data quality")
  }

  # No explicit failures: fall back to duration-based anomaly labelling.
  if (all(processed_data$BACKUP_SUCCESS == 1)) {
    message("No explicit backup failures detected - analyzing duration patterns for anomalies")
    if (all(is.na(processed_data$BACKUP_DURATION))) {
      stop("No valid backup durations available for anomaly detection")
    }
    duration_stats <- processed_data %>%
      summarise(
        mean_duration = mean(BACKUP_DURATION, na.rm = TRUE),
        sd_duration = sd(BACKUP_DURATION, na.rm = TRUE)
      )
    # Only z-score when there is actual variation to measure against.
    if (!is.na(duration_stats$sd_duration) && duration_stats$sd_duration > 0) {
      processed_data <- processed_data %>%
        mutate(
          DURATION_Z = (BACKUP_DURATION - duration_stats$mean_duration) /
            duration_stats$sd_duration,
          BACKUP_SUCCESS = ifelse(abs(DURATION_Z) > 3, 0, 1)
        )
    } else {
      message("Insufficient duration variation for anomaly detection - all marked as successful")
    }
  }

  processed_data
}
# -----------------------------------------------------------------------------------
# Aggregate raw CPU samples to per-database, per-day averages.
#
# Returns a frame with DB_NAME, CHECK_DATE, AVG_CPU, MAX_CPU, CPU_COUNT.
# NULL/empty input or missing required columns produce a warning and an empty
# (but correctly typed) frame instead of an error.
#
# Improvement: the zero-row fallback frame was previously copy-pasted three
# times inline; it is now built by a single local helper.
# -----------------------------------------------------------------------------------
process_system_metrics <- function(cpu_data) {
  # One definition of the empty fallback frame.
  empty_metrics <- function() {
    data.frame(
      DB_NAME = character(),
      CHECK_DATE = as.Date(character()),
      AVG_CPU = numeric(),
      MAX_CPU = numeric(),
      CPU_COUNT = numeric()
    )
  }

  if (is.null(cpu_data)) {
    warning("CPU data is NULL - using empty dataframe")
    return(empty_metrics())
  }
  if (nrow(cpu_data) == 0) {
    warning("CPU data is empty - using empty dataframe")
    return(empty_metrics())
  }

  required_cols <- c("DB_NAME", "CPU_COUNT", "TOTAL_CPU_USAGE", "CHECK_DATE")
  missing_cols <- setdiff(required_cols, colnames(cpu_data))
  if (length(missing_cols) > 0) {
    warning("Missing columns in CPU data: ", paste(missing_cols, collapse = ", "))
    return(empty_metrics())
  }

  cpu_data %>%
    mutate(
      CHECK_DATE = tryCatch(
        as.Date(CHECK_DATE),
        error = function(e) {
          warning("Failed to parse CHECK_DATE in CPU data: ", e$message)
          NA
        }
      ),
      TOTAL_CPU_USAGE = tryCatch(
        as.numeric(TOTAL_CPU_USAGE),
        error = function(e) {
          warning("Failed to convert TOTAL_CPU_USAGE to numeric: ", e$message)
          NA
        }
      ),
      CPU_COUNT = tryCatch(
        as.numeric(CPU_COUNT),
        error = function(e) {
          warning("Failed to convert CPU_COUNT to numeric: ", e$message)
          NA
        }
      )
    ) %>%
    filter(!is.na(CHECK_DATE), !is.na(TOTAL_CPU_USAGE), !is.na(CPU_COUNT)) %>%
    group_by(DB_NAME, CHECK_DATE) %>%
    summarise(
      AVG_CPU = mean(TOTAL_CPU_USAGE, na.rm = TRUE),
      MAX_CPU = max(TOTAL_CPU_USAGE, na.rm = TRUE),
      CPU_COUNT = first(CPU_COUNT),
      .groups = "drop"
    )
}
# -----------------------------------------------------------------------------------
# Join backup records with daily CPU aggregates and derive the modeling TARGET.
#
# Target selection: BACKUP_SUCCESS (classification) when any failures exist,
# otherwise BACKUP_DURATION (regression); stops when neither is usable.
# Missing CPU metrics are imputed with the column median.
# -----------------------------------------------------------------------------------
prepare_analysis_data <- function(backup_data, cpu_avg) {
  if (nrow(backup_data) == 0) {
    stop("No backup data available for analysis")
  }

  # Attach CPU metrics; median-impute NAs so the model matrix stays complete.
  analysis_data <- backup_data %>%
    left_join(cpu_avg, by = c("DB_NAME", "CHECK_DATE")) %>%
    mutate(
      AVG_CPU = ifelse(is.na(AVG_CPU), median(AVG_CPU, na.rm = TRUE), AVG_CPU),
      CPU_COUNT = ifelse(is.na(CPU_COUNT), median(CPU_COUNT, na.rm = TRUE), CPU_COUNT)
    )

  any_failures <- any(backup_data$BACKUP_SUCCESS == 0, na.rm = TRUE)
  any_durations <- any(!is.na(backup_data$BACKUP_DURATION))

  if (any_failures) {
    message("Using backup success/failure as target variable")
    target_col <- "BACKUP_SUCCESS"
  } else if (any_durations) {
    message("No failures detected - using backup duration as target variable")
    target_col <- "BACKUP_DURATION"
  } else {
    stop("No valid target variable available - need either failures or durations")
  }

  analysis_data <- analysis_data %>%
    mutate(TARGET = .data[[target_col]]) %>%
    filter(!is.na(TARGET))

  if (nrow(analysis_data) == 0) {
    stop("No valid records remaining after target variable preparation")
  }

  analysis_data
}
# -----------------------------------------------------------------------------------
# Fit an xgboost model on the prepared features.
#
# Selects binary:logistic when TARGET is strictly 0/1, otherwise
# reg:squarederror (duration modeling).
# Returns list(model, is_classification, features); stops on empty input or
# missing feature columns.
# -----------------------------------------------------------------------------------
train_model <- function(train_data) {
  if (nrow(train_data) == 0) {
    stop("No training data available")
  }

  feature_cols <- c("BACKUP_HOUR", "BACKUP_DAY", "AVG_CPU", "CPU_COUNT")
  absent <- setdiff(feature_cols, colnames(train_data))
  if (length(absent) > 0) {
    stop("Missing required features: ", paste(absent, collapse = ", "))
  }

  # Design matrix without an intercept (the "- 1"); xgboost does not need one.
  design_formula <- as.formula(
    paste("~", paste(feature_cols, collapse = " + "), "- 1")
  )
  design <- tryCatch(
    model.matrix(design_formula, data = train_data),
    error = function(e) stop("Failed to create model matrix: ", e$message)
  )

  # A strictly 0/1 target means classification; anything else is regression.
  is_classification <- all(train_data$TARGET %in% c(0, 1))
  if (is_classification) {
    message("Training classification model")
    params <- list(
      objective = "binary:logistic",
      eval_metric = "logloss",
      max_depth = 6,
      eta = 0.1
    )
  } else {
    message("Training regression model for backup duration")
    params <- list(
      objective = "reg:squarederror",
      eval_metric = "rmse",
      max_depth = 6,
      eta = 0.1
    )
  }

  fitted <- tryCatch(
    xgboost(
      data = design,
      label = train_data$TARGET,
      params = params,
      nrounds = 100,
      verbose = 0
    ),
    error = function(e) stop("Model training failed: ", e$message)
  )

  list(
    model = fitted,
    is_classification = is_classification,
    features = feature_cols
  )
}
# -----------------------------------------------------------------------------------
# Evaluate a trained model on held-out data.
#
# Classification: returns confusionMatrix metrics plus per-row predictions.
# Regression: returns RMSE / MAE / R-squared plus per-row predictions.
# Returns NULL (with a warning) on any evaluation failure.
#
# FIX: the original built factors and then assigned
# `levels(pred_classes) <- levels(true_labels) <- c("0", "1")`. When a factor
# happened to contain only one observed level (e.g. a test split with no
# failures), `levels<-` silently RELABELLED that single level instead of adding
# the missing one, corrupting the confusion matrix. Constructing the factors
# with an explicit `levels =` argument is safe in all cases.
# -----------------------------------------------------------------------------------
evaluate_model <- function(model_result, test_data) {
  tryCatch({
    features <- model_result$features
    missing_features <- setdiff(features, colnames(test_data))
    if (length(missing_features) > 0) {
      stop("Missing features in test data: ", paste(missing_features, collapse = ", "))
    }

    formula <- as.formula(paste("~", paste(features, collapse = " + "), "- 1"))
    test_matrix <- model.matrix(formula, data = test_data)
    predictions <- predict(model_result$model, test_matrix)

    if (model_result$is_classification) {
      # 0.5 decision threshold; levels fixed up front so both factors always
      # carry exactly c("0", "1") regardless of what is observed.
      pred_classes <- factor(ifelse(predictions > 0.5, "1", "0"), levels = c("0", "1"))
      true_labels <- factor(as.character(test_data$TARGET), levels = c("0", "1"))
      conf_matrix <- caret::confusionMatrix(pred_classes, true_labels, positive = "1")
      return(list(
        type = "classification",
        metrics = conf_matrix,
        predictions = data.frame(
          Actual = test_data$TARGET,
          Predicted = predictions,
          Class = pred_classes
        )
      ))
    } else {
      actuals <- test_data$TARGET
      rmse <- sqrt(mean((actuals - predictions)^2))
      mae <- mean(abs(actuals - predictions))
      r_squared <- cor(actuals, predictions)^2
      return(list(
        type = "regression",
        metrics = list(rmse = rmse, mae = mae, r_squared = r_squared),
        predictions = data.frame(
          Actual = actuals,
          Predicted = predictions
        )
      ))
    }
  }, error = function(e) {
    warning("Model evaluation failed: ", e$message)
    NULL
  })
}
# -----------------------------------------------------------------------------------
# Build the ggplot objects used in the report.
#
# Returns a named list (success_plot, duration_plot, cpu_plot, conf_plot /
# pred_plot); each entry is present only when the needed columns or evaluation
# results exist.
#
# FIX: the CPU section dereferenced model_eval$type BEFORE the NULL check —
# but evaluate_model() returns NULL on failure, so this branch crashed with an
# "argument is of length zero" error. The plot kind is now derived defensively,
# falling back to inspecting TARGET itself when model_eval is NULL.
# -----------------------------------------------------------------------------------
create_visualizations <- function(backup_data, analysis_data, model_eval) {
  plots <- list()

  # Backup success distribution
  if ("BACKUP_SUCCESS" %in% colnames(backup_data)) {
    plots$success_plot <- ggplot(
      backup_data,
      aes(x = factor(BACKUP_SUCCESS), fill = factor(BACKUP_SUCCESS))
    ) +
      geom_bar() +
      labs(title = "Backup Success Distribution", x = "Backup Status", y = "Count") +
      scale_fill_manual(values = c("1" = "green4", "0" = "red2"),
                        labels = c("Failure", "Success")) +
      theme_minimal()
  }

  # Backup duration by hour of day
  if ("BACKUP_HOUR" %in% colnames(backup_data) &&
      "BACKUP_DURATION" %in% colnames(backup_data)) {
    plots$duration_plot <- ggplot(backup_data, aes(x = BACKUP_HOUR, y = BACKUP_DURATION)) +
      geom_boxplot(aes(group = BACKUP_HOUR), fill = "lightblue") +
      labs(title = "Backup Duration by Hour of Day",
           x = "Hour of Day", y = "Duration (minutes)") +
      theme_minimal()
  }

  # CPU usage against the target (density by class, or scatter vs duration)
  if ("AVG_CPU" %in% colnames(analysis_data) && "TARGET" %in% colnames(analysis_data)) {
    is_classification <- if (!is.null(model_eval)) {
      model_eval$type == "classification"
    } else {
      # model evaluation failed; infer the plot kind from the target itself
      all(analysis_data$TARGET %in% c(0, 1))
    }
    if (is_classification) {
      plots$cpu_plot <- ggplot(analysis_data, aes(x = AVG_CPU, fill = factor(TARGET))) +
        geom_density(alpha = 0.5) +
        labs(title = "CPU Usage by Backup Status",
             x = "Average CPU Usage", y = "Density") +
        scale_fill_manual(values = c("1" = "green4", "0" = "red2"),
                          labels = c("Failure", "Success")) +
        theme_minimal()
    } else {
      plots$cpu_plot <- ggplot(analysis_data, aes(x = AVG_CPU, y = TARGET)) +
        geom_point(alpha = 0.5) +
        geom_smooth(method = "lm", color = "blue") +
        labs(title = "CPU Usage vs Backup Duration",
             x = "Average CPU Usage", y = "Duration (minutes)") +
        theme_minimal()
    }
  }

  # Model performance visualization
  if (!is.null(model_eval)) {
    if (model_eval$type == "classification") {
      # Confusion matrix heat map
      conf_data <- as.data.frame(model_eval$metrics$table)
      plots$conf_plot <- ggplot(conf_data, aes(x = Reference, y = Prediction, fill = Freq)) +
        geom_tile() +
        geom_text(aes(label = Freq), color = "white") +
        scale_fill_gradient(low = "blue", high = "red") +
        labs(title = "Confusion Matrix", x = "Actual", y = "Predicted") +
        theme_minimal()
    } else {
      # Regression: predicted vs actual with a y = x reference line
      plots$pred_plot <- ggplot(model_eval$predictions, aes(x = Actual, y = Predicted)) +
        geom_point(alpha = 0.5) +
        geom_abline(slope = 1, intercept = 0, color = "red") +
        labs(title = "Actual vs Predicted Backup Duration",
             subtitle = paste("RMSE:", round(model_eval$metrics$rmse, 2),
                              "R-squared:", round(model_eval$metrics$r_squared, 2))) +
        theme_minimal()
    }
  }

  plots
}
# -----------------------------------------------------------------------------------
# Render the analysis report to `filename` via a temporary R Markdown file.
#
# backup_data / analysis_data / model_eval / plots are referenced by name from
# the Rmd chunks; `output_format` is "pdf" or "html".
#
# FIX: the temp Rmd is now removed via on.exit(), so it is cleaned up even when
# render() fails (the original only unlinked on the success path).
# Improvement: render() is given envir = environment() explicitly — the chunks
# must see this function's arguments; previously that worked only because
# render()'s parent.frame() default happened to be the same environment.
# -----------------------------------------------------------------------------------
generate_report <- function(backup_data, analysis_data, model_eval, plots,
                            filename, output_format) {
  temp_rmd <- tempfile(fileext = ".Rmd")
  on.exit(unlink(temp_rmd), add = TRUE)

  yaml_output <- if (output_format == "pdf") "pdf_document" else "html_document"

  writeLines(c(
    "---",
    paste0("title: 'Database Backup Analysis Report'"),
    "author: 'Automated Analysis'",
    paste0("date: '", format(Sys.time(), "%B %d, %Y"), "'"),
    paste0("output: ", yaml_output),
    "---",
    "",
    "```{r setup, include=FALSE}",
    "knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)",
    "```",
    "",
    "# Database Backup Analysis Report",
    "",
    "## Data Overview",
    "",
    "```{r data-overview}",
    "cat(paste('Total backup records analyzed:', nrow(backup_data)))",
    "if ('BACKUP_SUCCESS' %in% colnames(backup_data)) {",
    " cat(paste('\\nSuccess rate:', round(mean(backup_data$BACKUP_SUCCESS) * 100, 1), '%'))",
    "}",
    "if ('BACKUP_DURATION' %in% colnames(backup_data)) {",
    " cat(paste('\\nAverage backup duration:', round(mean(backup_data$BACKUP_DURATION, na.rm = TRUE), 1), 'minutes'))",
    "}",
    "```",
    "",
    "## Visualizations",
    "",
    "```{r plots}",
    "if (!is.null(plots$success_plot)) print(plots$success_plot)",
    "if (!is.null(plots$duration_plot)) print(plots$duration_plot)",
    "if (!is.null(plots$cpu_plot)) print(plots$cpu_plot)",
    "```",
    "",
    "## Model Performance",
    "",
    "```{r model-performance}",
    "if (!is.null(model_eval)) {",
    " if (model_eval$type == 'classification') {",
    " cat('### Classification Performance\\n\\n')",
    " print(model_eval$metrics$table)",
    " cat(paste('\\n\\nAccuracy:', round(model_eval$metrics$overall['Accuracy'], 4)))",
    " if (!is.null(plots$conf_plot)) print(plots$conf_plot)",
    " } else {",
    " cat('### Regression Performance\\n\\n')",
    " cat(paste('RMSE:', round(model_eval$metrics$rmse, 2), 'minutes\\n'))",
    " cat(paste('MAE:', round(model_eval$metrics$mae, 2), 'minutes\\n'))",
    " cat(paste('R-squared:', round(model_eval$metrics$r_squared, 2), '\\n\\n'))",
    " if (!is.null(plots$pred_plot)) print(plots$pred_plot)",
    " }",
    "}",
    "```",
    "",
    "## Recommendations",
    "",
    "1. **Optimal Backup Timing**: Based on the analysis, consider scheduling backups during hours with historically higher success rates and shorter durations.",
    "2. **Resource Allocation**: Monitor CPU usage during backup windows and consider allocating additional resources if correlations are found.",
    "3. **Anomaly Detection**: Implement automated alerts for backup durations that fall outside expected ranges.",
    "",
    paste0("### Generated on ", format(Sys.time(), "%B %d, %Y at %H:%M:%S"))
  ), temp_rmd)

  rmarkdown::render(
    input = temp_rmd,
    output_file = filename,
    envir = environment(),
    quiet = TRUE
  )

  invisible(filename)
}
# -----------------------------------------------------------------------------------
# Main execution pipeline: connect, query, preprocess, model, report.
#
# FIX: connection cleanup is now registered once with on.exit(), so the
# connection is released on every exit path; the original duplicated an
# exists("conn")/dbIsValid() block in both the success and error branches.
# -----------------------------------------------------------------------------------
main <- function() {
  conn <- NULL
  on.exit({
    if (!is.null(conn) && dbIsValid(conn)) {
      dbDisconnect(conn)
      message("Database connection closed")
    }
  }, add = TRUE)

  tryCatch({
    # Decide report format up front (pdf when Pandoc is available, else html).
    output_format <- check_pandoc()
    report_filename <- generate_report_filename(output_format)

    conn <- connect_to_database()

    # Raw backup status records
    backup_query <- "SELECT DB_NAME, BACKUP_TYPE, COMPLETION_TIME, CHECK_DATE FROM ATTSTATS.BACKUP_STATUS"
    backup_data <- query_data(conn, backup_query)

    # Raw CPU samples
    cpu_query <- "SELECT DB_NAME, CPU_COUNT, TOTAL_CPU_USAGE, CHECK_DATE FROM ATTSTATS.CPU_USAGE"
    cpu_data <- query_data(conn, cpu_query)

    # Clean / aggregate / join
    backup_data <- preprocess_backup_data(backup_data)
    cpu_avg <- process_system_metrics(cpu_data)
    backup_analysis <- prepare_analysis_data(backup_data, cpu_avg)

    # 80/20 split, reproducible
    set.seed(42)
    train_index <- createDataPartition(backup_analysis$TARGET, p = 0.8, list = FALSE)
    train_data <- backup_analysis[train_index, ]
    test_data <- backup_analysis[-train_index, ]

    model_result <- train_model(train_data)
    model_eval <- evaluate_model(model_result, test_data)

    plots <- create_visualizations(backup_data, backup_analysis, model_eval)
    generate_report(backup_data, backup_analysis, model_eval, plots,
                    report_filename, output_format)
    message("\nReport generated successfully: ", report_filename)
  }, error = function(e) {
    message("Error in main execution: ", e$message)
    stop("Script execution failed")
  })
}
# -----------------------------------------------------------------------------------
# Script entry point
# -----------------------------------------------------------------------------------
main()
# -----------------------------------------------------------------------------------
# End of script — only reached when main() completed without stop().
# -----------------------------------------------------------------------------------
message("Script execution completed successfully.")
# -----------------------------------------------------------------------------------# Load required packages with version checks and quiet loading# -----------------------------------------------------------------------------------required_packages <- c("DBI", "odbc", "dplyr", "lubridate", "ggplot2", "caret", "xgboost", "tidyr", "isotree", "rmarkdown", "knitr", "gridExtra")invisible(lapply(required_packages, function(pkg) { if (!require(pkg, character.only = TRUE, quietly = TRUE)) { install.packages(pkg, quiet = TRUE) library(pkg, character.only = TRUE) }}))
# Install tinytex for PDF generation if not available# install.packages('tinytex')# tinytex::install_tinytex()
# -----------------------------------------------------------------------------------# Function to check Pandoc availability with fallback# -----------------------------------------------------------------------------------check_pandoc <- function() { if (!rmarkdown::pandoc_available()) { message("Pandoc not found - attempting to install...") tryCatch({ rmarkdown::install_pandoc() if (!rmarkdown::pandoc_available()) { message("PDF generation unavailable - falling back to HTML") return("html") } return("pdf") }, error = function(e) { message("Could not install Pandoc: ", e$message) message("Falling back to HTML output") return("html") }) } else { return("pdf") }}
# -----------------------------------------------------------------------------------# Create a unique filename for the report with full path# -----------------------------------------------------------------------------------generate_report_filename <- function(output_format) { # Define the fixed output directory output_dir <- "F:\\DBA\\Scripts\\R\\Reports" # Create directory if it doesn't exist (with recursive = TRUE to create full path) if (!dir.exists(output_dir)) { dir.create(output_dir, recursive = TRUE) } timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S") filename <- paste0("Backup_Analysis_Report_", timestamp, ".", output_format) # Return full path (using file.path for cross-platform compatibility) file.path(output_dir, filename)}
# -----------------------------------------------------------------------------------# Enhanced database connection with validation# -----------------------------------------------------------------------------------connect_to_database <- function() { tryCatch({ conn <- dbConnect(odbc::odbc(), dsn = "[DB_NAME]", uid = "[USERNAME]", pwd = "[PASSWORD]") timeout = 10) message("Successfully connected to database") # Verify connection works with a simple query test_query <- "SELECT 1 FROM DUAL" test_result <- dbGetQuery(conn, test_query) if (nrow(test_result) != 1) { stop("Test query failed - connection validation unsuccessful") } return(conn) }, error = function(e) { stop("Database connection failed: ", e$message) })}
# -----------------------------------------------------------------------------------# Robust query function with schema and data validation# -----------------------------------------------------------------------------------query_data <- function(conn, query) { tryCatch({ result <- dbGetQuery(conn, query) if (nrow(result) == 0) { warning("Query returned empty result: ", substr(query, 1, 30), "...") # Return empty dataframe with correct structure col_info <- tryCatch( dbColumnInfo(dbSendQuery(conn, query)), error = function(e) NULL ) if (is.null(col_info)) { return(data.frame()) } return(data.frame(matrix(ncol = nrow(col_info), nrow = 0))) } message("Successfully queried ", nrow(result), " rows from: ", substr(query, 1, 30), "...") return(result) }, error = function(e) { warning("Query '", substr(query, 1, 30), "...' failed: ", e$message) return(NULL) })}
# -----------------------------------------------------------------------------------# Enhanced preprocessing with muted warning for no failures# -----------------------------------------------------------------------------------preprocess_backup_data <- function(backup_data) { if (is.null(backup_data)) { stop("Backup data is NULL - check your database connection and query") } if (nrow(backup_data) == 0) { stop("Backup data is empty - no records returned from query") } required_cols <- c("DB_NAME", "BACKUP_TYPE", "COMPLETION_TIME", "CHECK_DATE") missing_cols <- setdiff(required_cols, colnames(backup_data)) if (length(missing_cols) > 0) { stop("Missing required columns: ", paste(missing_cols, collapse = ", ")) } # Extended failure patterns failure_patterns <- c("FAILED", "ERROR", "ABORTED", "STOPPED", "UNSUCCESSFUL", "PARTIAL", "TIMEOUT") success_patterns <- c("COMPLETED", "SUCCESS", "OK", "DONE", "FINISHED") processed_data <- backup_data %>% mutate( COMPLETION_TIME = tryCatch( as.POSIXct(COMPLETION_TIME, format="%Y-%m-%d %H:%M:%S"), error = function(e) { warning("Failed to parse COMPLETION_TIME: ", e$message) NA } ), CHECK_DATE = tryCatch( as.Date(CHECK_DATE), error = function(e) { warning("Failed to parse CHECK_DATE: ", e$message) NA } ), BACKUP_SUCCESS = case_when( grepl(paste(failure_patterns, collapse = "|"), BACKUP_TYPE, ignore.case = TRUE) ~ 0, grepl(paste(success_patterns, collapse = "|"), BACKUP_TYPE, ignore.case = TRUE) ~ 1, TRUE ~ 1 # Default to success if not explicitly a failure ), BACKUP_HOUR = ifelse(!is.na(COMPLETION_TIME), lubridate::hour(COMPLETION_TIME), NA), BACKUP_DAY = ifelse(!is.na(COMPLETION_TIME), lubridate::wday(COMPLETION_TIME, label = TRUE), NA), BACKUP_DURATION = ifelse( !is.na(COMPLETION_TIME) & !is.na(CHECK_DATE), as.numeric(difftime(COMPLETION_TIME, as.POSIXct(CHECK_DATE), units = "mins")), NA_real_ ) ) # Apply filters processed_data <- processed_data %>% filter(!is.na(COMPLETION_TIME), !is.na(BACKUP_SUCCESS)) %>% distinct() if 
(nrow(processed_data) == 0) { stop("After preprocessing, no valid records remain - check your data quality") } # Check for any failures - using message instead of warning if (all(processed_data$BACKUP_SUCCESS == 1)) { message("No explicit backup failures detected - analyzing duration patterns for anomalies") # Calculate duration statistics only if we have valid durations if (all(is.na(processed_data$BACKUP_DURATION))) { stop("No valid backup durations available for anomaly detection") } duration_stats <- processed_data %>% summarise( mean_duration = mean(BACKUP_DURATION, na.rm = TRUE), sd_duration = sd(BACKUP_DURATION, na.rm = TRUE) ) # Mark outliers as potential failures (only if we have variation) if (!is.na(duration_stats$sd_duration) && duration_stats$sd_duration > 0) { processed_data <- processed_data %>% mutate( DURATION_Z = (BACKUP_DURATION - duration_stats$mean_duration) / duration_stats$sd_duration, BACKUP_SUCCESS = ifelse(abs(DURATION_Z) > 3, 0, 1) ) } else { message("Insufficient duration variation for anomaly detection - all marked as successful") } } return(processed_data)}
# -----------------------------------------------------------------------------------# Process system metrics with robust handling# -----------------------------------------------------------------------------------process_system_metrics <- function(cpu_data) { if (is.null(cpu_data)) { warning("CPU data is NULL - using empty dataframe") return(data.frame( DB_NAME = character(), CHECK_DATE = as.Date(character()), AVG_CPU = numeric(), MAX_CPU = numeric(), CPU_COUNT = numeric() )) } if (nrow(cpu_data) == 0) { warning("CPU data is empty - using empty dataframe") return(data.frame( DB_NAME = character(), CHECK_DATE = as.Date(character()), AVG_CPU = numeric(), MAX_CPU = numeric(), CPU_COUNT = numeric() )) } required_cols <- c("DB_NAME", "CPU_COUNT", "TOTAL_CPU_USAGE", "CHECK_DATE") missing_cols <- setdiff(required_cols, colnames(cpu_data)) if (length(missing_cols) > 0) { warning("Missing columns in CPU data: ", paste(missing_cols, collapse = ", ")) return(data.frame( DB_NAME = character(), CHECK_DATE = as.Date(character()), AVG_CPU = numeric(), MAX_CPU = numeric(), CPU_COUNT = numeric() )) } cpu_avg <- cpu_data %>% mutate( CHECK_DATE = tryCatch( as.Date(CHECK_DATE), error = function(e) { warning("Failed to parse CHECK_DATE in CPU data: ", e$message) NA } ), TOTAL_CPU_USAGE = tryCatch( as.numeric(TOTAL_CPU_USAGE), error = function(e) { warning("Failed to convert TOTAL_CPU_USAGE to numeric: ", e$message) NA } ), CPU_COUNT = tryCatch( as.numeric(CPU_COUNT), error = function(e) { warning("Failed to convert CPU_COUNT to numeric: ", e$message) NA } ) ) %>% filter(!is.na(CHECK_DATE), !is.na(TOTAL_CPU_USAGE), !is.na(CPU_COUNT)) %>% group_by(DB_NAME, CHECK_DATE) %>% summarise( AVG_CPU = mean(TOTAL_CPU_USAGE, na.rm = TRUE), MAX_CPU = max(TOTAL_CPU_USAGE, na.rm = TRUE), CPU_COUNT = first(CPU_COUNT), .groups = "drop" ) return(cpu_avg)}
# -----------------------------------------------------------------------------------# Prepare analysis data with comprehensive checks# -----------------------------------------------------------------------------------prepare_analysis_data <- function(backup_data, cpu_avg) { if (nrow(backup_data) == 0) { stop("No backup data available for analysis") } analysis_data <- backup_data %>% left_join(cpu_avg, by = c("DB_NAME", "CHECK_DATE")) %>% mutate( AVG_CPU = ifelse(is.na(AVG_CPU), median(AVG_CPU, na.rm = TRUE), AVG_CPU), CPU_COUNT = ifelse(is.na(CPU_COUNT), median(CPU_COUNT, na.rm = TRUE), CPU_COUNT) ) # Determine target variable has_failures <- any(backup_data$BACKUP_SUCCESS == 0, na.rm = TRUE) has_durations <- any(!is.na(backup_data$BACKUP_DURATION)) if (has_failures) { message("Using backup success/failure as target variable") analysis_data <- analysis_data %>% mutate(TARGET = BACKUP_SUCCESS) %>% filter(!is.na(TARGET)) } else if (has_durations) { message("No failures detected - using backup duration as target variable") analysis_data <- analysis_data %>% mutate(TARGET = BACKUP_DURATION) %>% filter(!is.na(TARGET)) } else { stop("No valid target variable available - need either failures or durations") } if (nrow(analysis_data) == 0) { stop("No valid records remaining after target variable preparation") } return(analysis_data)}
# -----------------------------------------------------------------------------------# Enhanced model training with validation# -----------------------------------------------------------------------------------train_model <- function(train_data) { if (nrow(train_data) == 0) { stop("No training data available") } features <- c("BACKUP_HOUR", "BACKUP_DAY", "AVG_CPU", "CPU_COUNT") missing_features <- setdiff(features, colnames(train_data)) if (length(missing_features) > 0) { stop("Missing required features: ", paste(missing_features, collapse = ", ")) } # Prepare model matrix with error handling formula <- as.formula(paste("~", paste(features, collapse = " + "), "- 1")) train_matrix <- tryCatch( model.matrix(formula, data = train_data), error = function(e) { stop("Failed to create model matrix: ", e$message) } ) if (all(train_data$TARGET %in% c(0, 1))) { # Classification message("Training classification model") params <- list( objective = "binary:logistic", eval_metric = "logloss", max_depth = 6, eta = 0.1 ) } else { # Regression message("Training regression model for backup duration") params <- list( objective = "reg:squarederror", eval_metric = "rmse", max_depth = 6, eta = 0.1 ) } model <- tryCatch( xgboost( data = train_matrix, label = train_data$TARGET, params = params, nrounds = 100, verbose = 0 ), error = function(e) { stop("Model training failed: ", e$message) } ) return(list( model = model, is_classification = all(train_data$TARGET %in% c(0, 1)), features = features ))}
# -----------------------------------------------------------------------------------
# Comprehensive model evaluation
# -----------------------------------------------------------------------------------
# Scores model_result (from train_model) against test_data.
#
# Returns a list with $type ("classification" or "regression"), $metrics,
# and $predictions; returns NULL (with a warning) if evaluation fails.
evaluate_model <- function(model_result, test_data) {
  tryCatch({
    features <- model_result$features
    missing_features <- setdiff(features, colnames(test_data))
    if (length(missing_features) > 0) {
      stop("Missing features in test data: ", paste(missing_features, collapse = ", "))
    }

    formula <- as.formula(paste("~", paste(features, collapse = " + "), "- 1"))
    test_matrix <- model.matrix(formula, data = test_data)
    predictions <- predict(model_result$model, test_matrix)

    if (model_result$is_classification) {
      # Classification metrics.
      # BUG FIX: the original built the factors with as.factor() and then
      # overwrote both with `levels(...) <- c("0", "1")`. When a factor had
      # only one observed level (e.g. every prediction > 0.5), that assignment
      # relabelled the values themselves — all "1"s silently became "0"s.
      # Declaring the level set up front preserves the value-to-level mapping.
      pred_classes <- factor(ifelse(predictions > 0.5, 1, 0), levels = c("0", "1"))
      true_labels <- factor(test_data$TARGET, levels = c("0", "1"))
      conf_matrix <- caret::confusionMatrix(pred_classes, true_labels, positive = "1")
      return(list(
        type = "classification",
        metrics = conf_matrix,
        predictions = data.frame(
          Actual = test_data$TARGET,
          Predicted = predictions,
          Class = pred_classes
        )
      ))
    } else {
      # Regression metrics.
      actuals <- test_data$TARGET
      rmse <- sqrt(mean((actuals - predictions)^2))
      mae <- mean(abs(actuals - predictions))
      r_squared <- cor(actuals, predictions)^2
      return(list(
        type = "regression",
        metrics = list(rmse = rmse, mae = mae, r_squared = r_squared),
        predictions = data.frame(
          Actual = actuals,
          Predicted = predictions
        )
      ))
    }
  }, error = function(e) {
    warning("Model evaluation failed: ", e$message)
    return(NULL)
  })
}
# -----------------------------------------------------------------------------------
# Create visualizations for the report
# -----------------------------------------------------------------------------------
# Builds a named list of ggplot objects for the report. Each plot is only
# created when its required columns exist, so the function degrades
# gracefully on partial data. model_eval may be NULL (evaluate_model returns
# NULL on failure); model-dependent plots are skipped in that case.
create_visualizations <- function(backup_data, analysis_data, model_eval) {
  plots <- list()

  # Backup success distribution
  if ("BACKUP_SUCCESS" %in% colnames(backup_data)) {
    plots$success_plot <- ggplot(backup_data,
                                 aes(x = factor(BACKUP_SUCCESS),
                                     fill = factor(BACKUP_SUCCESS))) +
      geom_bar() +
      labs(title = "Backup Success Distribution",
           x = "Backup Status", y = "Count") +
      scale_fill_manual(values = c("1" = "green4", "0" = "red2"),
                        labels = c("Failure", "Success")) +
      theme_minimal()
  }

  # Backup duration by hour
  if ("BACKUP_HOUR" %in% colnames(backup_data) &&
      "BACKUP_DURATION" %in% colnames(backup_data)) {
    plots$duration_plot <- ggplot(backup_data,
                                  aes(x = BACKUP_HOUR, y = BACKUP_DURATION)) +
      geom_boxplot(aes(group = BACKUP_HOUR), fill = "lightblue") +
      labs(title = "Backup Duration by Hour of Day",
           x = "Hour of Day", y = "Duration (minutes)") +
      theme_minimal()
  }

  # CPU usage vs backup success.
  # BUG FIX: the original dereferenced model_eval$type here without a NULL
  # check; when evaluate_model failed (returning NULL), `if (NULL == ...)`
  # raised "argument is of length zero". Guard on model_eval first.
  if ("AVG_CPU" %in% colnames(analysis_data) &&
      "TARGET" %in% colnames(analysis_data) &&
      !is.null(model_eval)) {
    if (model_eval$type == "classification") {
      cpu_plot <- ggplot(analysis_data, aes(x = AVG_CPU, fill = factor(TARGET))) +
        geom_density(alpha = 0.5) +
        labs(title = "CPU Usage by Backup Status",
             x = "Average CPU Usage", y = "Density") +
        scale_fill_manual(values = c("1" = "green4", "0" = "red2"),
                          labels = c("Failure", "Success")) +
        theme_minimal()
    } else {
      cpu_plot <- ggplot(analysis_data, aes(x = AVG_CPU, y = TARGET)) +
        geom_point(alpha = 0.5) +
        geom_smooth(method = "lm", color = "blue") +
        labs(title = "CPU Usage vs Backup Duration",
             x = "Average CPU Usage", y = "Duration (minutes)") +
        theme_minimal()
    }
    plots$cpu_plot <- cpu_plot
  }

  # Model performance visualization
  if (!is.null(model_eval)) {
    if (model_eval$type == "classification") {
      # Confusion matrix plot
      conf_data <- as.data.frame(model_eval$metrics$table)
      plots$conf_plot <- ggplot(conf_data,
                                aes(x = Reference, y = Prediction, fill = Freq)) +
        geom_tile() +
        geom_text(aes(label = Freq), color = "white") +
        scale_fill_gradient(low = "blue", high = "red") +
        labs(title = "Confusion Matrix", x = "Actual", y = "Predicted") +
        theme_minimal()
    } else {
      # Regression prediction vs actual plot
      plots$pred_plot <- ggplot(model_eval$predictions,
                                aes(x = Actual, y = Predicted)) +
        geom_point(alpha = 0.5) +
        geom_abline(slope = 1, intercept = 0, color = "red") +
        labs(title = "Actual vs Predicted Backup Duration",
             subtitle = paste("RMSE:", round(model_eval$metrics$rmse, 2),
                              "R-squared:", round(model_eval$metrics$r_squared, 2))) +
        theme_minimal()
    }
  }

  return(plots)
}
# -----------------------------------------------------------------------------------
# Generate the PDF report
# -----------------------------------------------------------------------------------
# Writes a temporary R Markdown document and renders it to `filename` in the
# requested output_format ("pdf" or anything else -> html). The Rmd chunks
# reference backup_data, plots and model_eval, which rmarkdown::render
# resolves from this function's environment (render's default envir is the
# caller's frame).
generate_report <- function(backup_data, analysis_data, model_eval, plots,
                            filename, output_format) {
  # Create a temporary Rmd file.
  temp_rmd <- tempfile(fileext = ".Rmd")
  # BUG FIX: the original only unlinked the temp file after a successful
  # render, leaking it whenever render() errored. on.exit guarantees cleanup.
  on.exit(unlink(temp_rmd), add = TRUE)

  # Determine output format and set YAML accordingly.
  if (output_format == "pdf") {
    yaml_output <- "pdf_document"
  } else {
    yaml_output <- "html_document"
  }

  # Write the Rmd content.
  writeLines(c(
    "---",
    paste0("title: 'Database Backup Analysis Report'"),
    "author: 'Automated Analysis'",
    paste0("date: '", format(Sys.time(), "%B %d, %Y"), "'"),
    paste0("output: ", yaml_output),
    "---",
    "",
    "```{r setup, include=FALSE}",
    "knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)",
    "```",
    "",
    "# Database Backup Analysis Report",
    "",
    "## Data Overview",
    "",
    "```{r data-overview}",
    "cat(paste('Total backup records analyzed:', nrow(backup_data)))",
    "if ('BACKUP_SUCCESS' %in% colnames(backup_data)) {",
    "  cat(paste('\\nSuccess rate:', round(mean(backup_data$BACKUP_SUCCESS) * 100, 1), '%'))",
    "}",
    "if ('BACKUP_DURATION' %in% colnames(backup_data)) {",
    "  cat(paste('\\nAverage backup duration:', round(mean(backup_data$BACKUP_DURATION, na.rm = TRUE), 1), 'minutes'))",
    "}",
    "```",
    "",
    "## Visualizations",
    "",
    "```{r plots}",
    "if (!is.null(plots$success_plot)) print(plots$success_plot)",
    "if (!is.null(plots$duration_plot)) print(plots$duration_plot)",
    "if (!is.null(plots$cpu_plot)) print(plots$cpu_plot)",
    "```",
    "",
    "## Model Performance",
    "",
    "```{r model-performance}",
    "if (!is.null(model_eval)) {",
    "  if (model_eval$type == 'classification') {",
    "    cat('### Classification Performance\\n\\n')",
    "    print(model_eval$metrics$table)",
    "    cat(paste('\\n\\nAccuracy:', round(model_eval$metrics$overall['Accuracy'], 4)))",
    "    if (!is.null(plots$conf_plot)) print(plots$conf_plot)",
    "  } else {",
    "    cat('### Regression Performance\\n\\n')",
    "    cat(paste('RMSE:', round(model_eval$metrics$rmse, 2), 'minutes\\n'))",
    "    cat(paste('MAE:', round(model_eval$metrics$mae, 2), 'minutes\\n'))",
    "    cat(paste('R-squared:', round(model_eval$metrics$r_squared, 2), '\\n\\n'))",
    "    if (!is.null(plots$pred_plot)) print(plots$pred_plot)",
    "  }",
    "}",
    "```",
    "",
    "## Recommendations",
    "",
    "1. **Optimal Backup Timing**: Based on the analysis, consider scheduling backups during hours with historically higher success rates and shorter durations.",
    "2. **Resource Allocation**: Monitor CPU usage during backup windows and consider allocating additional resources if correlations are found.",
    "3. **Anomaly Detection**: Implement automated alerts for backup durations that fall outside expected ranges.",
    "",
    paste0("### Generated on ", format(Sys.time(), "%B %d, %Y at %H:%M:%S"))
  ), temp_rmd)

  # Render the report.
  rmarkdown::render(
    input = temp_rmd,
    output_file = filename,
    quiet = TRUE
  )
}
# -----------------------------------------------------------------------------------
# Main execution function
# -----------------------------------------------------------------------------------
# Orchestrates the full pipeline: detect report format, connect to the
# database, query backup/CPU data, prepare the analysis dataset, train and
# evaluate the model, build plots, and render the report.
# NOTE(review): connect_to_database(), query_data(), preprocess_backup_data()
# and process_system_metrics() are not defined in this file — presumably they
# live in a companion script; confirm they are sourced before main() runs.
main <- function() {
  conn <- NULL
  # Guaranteed connection cleanup — the original duplicated the disconnect
  # logic in both the success path and the error handler; on.exit runs in
  # either case.
  on.exit({
    if (!is.null(conn) && dbIsValid(conn)) {
      dbDisconnect(conn)
      message("Database connection closed")
    }
  }, add = TRUE)

  tryCatch({
    # Check report format capability (pdf when Pandoc is usable, else html).
    output_format <- check_pandoc()

    # Generate appropriate report filename.
    report_filename <- generate_report_filename(output_format)

    # Connect to database.
    conn <<- conn  # no-op; keep conn local — assignment below updates it
    conn <- connect_to_database()

    # Query backup data.
    backup_query <- "SELECT DB_NAME, BACKUP_TYPE, COMPLETION_TIME, CHECK_DATE FROM ATTSTATS.BACKUP_STATUS"
    backup_data <- query_data(conn, backup_query)

    # Query CPU data.
    cpu_query <- "SELECT DB_NAME, CPU_COUNT, TOTAL_CPU_USAGE, CHECK_DATE FROM ATTSTATS.CPU_USAGE"
    cpu_data <- query_data(conn, cpu_query)

    # Preprocess data.
    backup_data <- preprocess_backup_data(backup_data)
    cpu_avg <- process_system_metrics(cpu_data)

    # Prepare analysis data.
    backup_analysis <- prepare_analysis_data(backup_data, cpu_avg)

    # Split data for modeling (fixed seed for reproducible partitions).
    set.seed(42)
    train_index <- createDataPartition(backup_analysis$TARGET, p = 0.8, list = FALSE)
    train_data <- backup_analysis[train_index, ]
    test_data <- backup_analysis[-train_index, ]

    # Train and evaluate model.
    model_result <- train_model(train_data)
    model_eval <- evaluate_model(model_result, test_data)

    # Create visualizations.
    plots <- create_visualizations(backup_data, backup_analysis, model_eval)

    # Generate report with the appropriate format.
    generate_report(backup_data, backup_analysis, model_eval, plots,
                    report_filename, output_format)

    message("\nReport generated successfully: ", report_filename)
  }, error = function(e) {
    message("Error in main execution: ", e$message)
    stop("Script execution failed")
  })
}
# -----------------------------------------------------------------------------------
# Execute the main function
# -----------------------------------------------------------------------------------
# Runs immediately when this file is source()d; main() stops on failure,
# aborting the rest of the script.
main()
# -----------------------------------------------------------------------------------
# End of Script
# -----------------------------------------------------------------------------------
# Only reached when main() completed without stopping, since its error
# handler calls stop() and aborts the source() above.
message("Script execution completed successfully.")
# -----------------------------------------------------------------------------------