• HOME
  • SERVICES
    • DATABASE SERVICES
    • CYBERSECURITY
    • INTERNET OF THINGS
  • PUBLISHING
    • PORTFOLIO
    • ORACLE SCRIPT LIBRARY
    • R CODE SCRIPT LIBRARY
  • ABOUT
    • ABOUT MONTBLEAU.CA
    • About Pierre Montbleau
  • Blog
  • Academic Press
    • Bookshelf

2-DB_Backup_Failure.r

# -----------------------------------------------------------------------------------
# File Name    : 2-DB_Backup_Failure.r
# Author       : Pierre Montbleau
# Description  : Database Backup Failure Analysis and Predictive Modeling Script
# Purpose      : Analyze backup success/failure patterns, build predictive models,
#                and generate comprehensive reports with recommendations
# Call Syntax  : source("F:\\DBA\\Scripts\\R\\2-DB_Backup_Failure.r")
# Parameters   : None - all parameters configured internally (output format auto-detected)
# Last Modified: 2024-01-15
# -----------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------
# Load required packages with version checks and quiet loading
# -----------------------------------------------------------------------------------
required_packages <- c(
  "DBI", "odbc", "dplyr", "lubridate", "ggplot2", "caret",
  "xgboost", "tidyr", "isotree", "rmarkdown", "knitr", "gridExtra"
)
for (pkg in required_packages) {
  # Try a quiet attach first; if the package is absent, install it and attach.
  if (!require(pkg, character.only = TRUE, quietly = TRUE)) {
    install.packages(pkg, quiet = TRUE)
    library(pkg, character.only = TRUE)
  }
}
# Install tinytex for PDF generation if not available:
# install.packages('tinytex')
# tinytex::install_tinytex()
# -----------------------------------------------------------------------------------
# Determine the report output format based on Pandoc availability.
#
# Returns "pdf" when rmarkdown can find a Pandoc binary, otherwise "html".
#
# BUG FIX: the original called rmarkdown::install_pandoc(), which is not a
# function exported by rmarkdown (only pandoc_available()/find_pandoc() exist).
# That call always errored inside the tryCatch and fell through to HTML after a
# misleading "attempting to install" message. Fall back directly instead.
# -----------------------------------------------------------------------------------
check_pandoc <- function() {
  if (rmarkdown::pandoc_available()) {
    return("pdf")
  }
  message("Pandoc not found - PDF generation unavailable, falling back to HTML")
  "html"
}
# -----------------------------------------------------------------------------------
# Build a unique, timestamped report filename (full path) for the given format.
#
# Args:
#   output_format: file extension for the report, e.g. "pdf" or "html".
# Returns:
#   Full path under the fixed reports directory; the directory is created
#   (recursively) on first use.
# -----------------------------------------------------------------------------------
generate_report_filename <- function(output_format) {
  # Fixed output location for all generated reports.
  report_dir <- "F:\\DBA\\Scripts\\R\\Reports"
  if (!dir.exists(report_dir)) {
    dir.create(report_dir, recursive = TRUE)
  }
  # Timestamp makes each report name unique down to the second.
  stamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
  file.path(report_dir, paste0("Backup_Analysis_Report_", stamp, ".", output_format))
}
# -----------------------------------------------------------------------------------
# Open and validate an ODBC database connection.
#
# Returns a live DBI connection; stops with a descriptive error on any failure.
# The test query uses DUAL, which suggests an Oracle backend — confirm the DSN.
#
# BUG FIX: the original had a stray ")" after pwd ('pwd = "[PASSWORD]") timeout
# = 10)'), which made the whole script unparseable. timeout is now passed as a
# regular argument to dbConnect().
# -----------------------------------------------------------------------------------
connect_to_database <- function() {
  tryCatch({
    conn <- dbConnect(
      odbc::odbc(),
      dsn = "[DB_NAME]",
      uid = "[USERNAME]",
      pwd = "[PASSWORD]",
      timeout = 10
    )
    message("Successfully connected to database")
    # Verify the connection actually works with a trivial round-trip query.
    test_query <- "SELECT 1 FROM DUAL"
    test_result <- dbGetQuery(conn, test_query)
    if (nrow(test_result) != 1) {
      stop("Test query failed - connection validation unsuccessful")
    }
    return(conn)
  }, error = function(e) {
    stop("Database connection failed: ", e$message)
  })
}
# -----------------------------------------------------------------------------------
# Run a query and return its result, degrading gracefully on problems.
#
# Args:
#   conn : live DBI connection.
#   query: SQL string.
# Returns:
#   The result data.frame; a zero-row data.frame (with the query's column count
#   when discoverable) for empty results; NULL when the query itself fails.
#
# BUG FIX: on the empty-result path the original called dbSendQuery() to fetch
# column metadata but never called dbClearResult(), leaking an open result set
# on the connection. The result is now cleared via on.exit.
# -----------------------------------------------------------------------------------
query_data <- function(conn, query) {
  tryCatch({
    result <- dbGetQuery(conn, query)
    if (nrow(result) == 0) {
      warning("Query returned empty result: ", substr(query, 1, 30), "...")
      # Recover the column structure so callers still get a correctly-shaped
      # (zero-row) frame; release the result set no matter what happens.
      col_info <- tryCatch({
        res <- dbSendQuery(conn, query)
        on.exit(dbClearResult(res), add = TRUE)
        dbColumnInfo(res)
      }, error = function(e) NULL)
      if (is.null(col_info)) {
        return(data.frame())
      }
      return(data.frame(matrix(ncol = nrow(col_info), nrow = 0)))
    }
    message("Successfully queried ", nrow(result), " rows from: ",
            substr(query, 1, 30), "...")
    return(result)
  }, error = function(e) {
    warning("Query '", substr(query, 1, 30), "...' failed: ", e$message)
    return(NULL)
  })
}
# -----------------------------------------------------------------------------------
# Clean and enrich raw backup records for modeling.
#
# Validates input, parses timestamps, derives a binary BACKUP_SUCCESS flag from
# keyword patterns in BACKUP_TYPE, adds hour/day/duration features, and — when
# no explicit failures exist — reclassifies extreme-duration outliers (|z| > 3)
# as failures so downstream models still have a signal.
#
# Args:
#   backup_data: data.frame with at least DB_NAME, BACKUP_TYPE,
#                COMPLETION_TIME, CHECK_DATE.
# Returns:
#   The processed data.frame; stops on NULL/empty input, missing columns, or
#   when no usable records/durations remain.
# -----------------------------------------------------------------------------------
preprocess_backup_data <- function(backup_data) {
  # Fail fast on unusable input.
  if (is.null(backup_data)) {
    stop("Backup data is NULL - check your database connection and query")
  }
  if (nrow(backup_data) == 0) {
    stop("Backup data is empty - no records returned from query")
  }
  required_cols <- c("DB_NAME", "BACKUP_TYPE", "COMPLETION_TIME", "CHECK_DATE")
  missing_cols <- setdiff(required_cols, colnames(backup_data))
  if (length(missing_cols) > 0) {
    stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
  }
  # Extended failure patterns: any of these substrings in BACKUP_TYPE marks a
  # failure; success patterns are matched second, and anything else defaults
  # to success.
  failure_patterns <- c("FAILED", "ERROR", "ABORTED", "STOPPED",
                        "UNSUCCESSFUL", "PARTIAL", "TIMEOUT")
  success_patterns <- c("COMPLETED", "SUCCESS", "OK", "DONE", "FINISHED")
  processed_data <- backup_data %>%
    mutate(
      # NOTE(review): tryCatch around a vectorized as.POSIXct rarely fires —
      # unparseable elements usually become NA silently rather than erroring.
      COMPLETION_TIME = tryCatch(
        as.POSIXct(COMPLETION_TIME, format="%Y-%m-%d %H:%M:%S"),
        error = function(e) {
          warning("Failed to parse COMPLETION_TIME: ", e$message)
          NA
        }
      ),
      CHECK_DATE = tryCatch(
        as.Date(CHECK_DATE),
        error = function(e) {
          warning("Failed to parse CHECK_DATE: ", e$message)
          NA
        }
      ),
      # 0 = failure, 1 = success; failure patterns take precedence.
      BACKUP_SUCCESS = case_when(
        grepl(paste(failure_patterns, collapse = "|"), BACKUP_TYPE, ignore.case = TRUE) ~ 0,
        grepl(paste(success_patterns, collapse = "|"), BACKUP_TYPE, ignore.case = TRUE) ~ 1,
        TRUE ~ 1 # Default to success if not explicitly a failure
      ),
      BACKUP_HOUR = ifelse(!is.na(COMPLETION_TIME), lubridate::hour(COMPLETION_TIME), NA),
      # NOTE(review): ifelse() strips the ordered-factor labels produced by
      # wday(label = TRUE), so BACKUP_DAY ends up as integer day codes (1-7),
      # not day names — confirm downstream code expects numeric here.
      BACKUP_DAY = ifelse(!is.na(COMPLETION_TIME), lubridate::wday(COMPLETION_TIME, label = TRUE), NA),
      # Minutes between CHECK_DATE (midnight) and COMPLETION_TIME — presumably
      # a proxy for backup duration; can be negative if completion precedes
      # the check date. TODO confirm intended semantics.
      BACKUP_DURATION = ifelse(
        !is.na(COMPLETION_TIME) & !is.na(CHECK_DATE),
        as.numeric(difftime(COMPLETION_TIME, as.POSIXct(CHECK_DATE), units = "mins")),
        NA_real_
      )
    )
  # Apply filters: drop rows without a parseable completion time, dedupe.
  processed_data <- processed_data %>%
    filter(!is.na(COMPLETION_TIME), !is.na(BACKUP_SUCCESS)) %>%
    distinct()
  if (nrow(processed_data) == 0) {
    stop("After preprocessing, no valid records remain - check your data quality")
  }
  # Check for any failures - using message instead of warning. When every row
  # is a success, fall back to duration-based anomaly flagging.
  if (all(processed_data$BACKUP_SUCCESS == 1)) {
    message("No explicit backup failures detected - analyzing duration patterns for anomalies")
    # Calculate duration statistics only if we have valid durations.
    if (all(is.na(processed_data$BACKUP_DURATION))) {
      stop("No valid backup durations available for anomaly detection")
    }
    duration_stats <- processed_data %>%
      summarise(
        mean_duration = mean(BACKUP_DURATION, na.rm = TRUE),
        sd_duration = sd(BACKUP_DURATION, na.rm = TRUE)
      )
    # Mark outliers as potential failures (only if we have variation):
    # any duration more than 3 standard deviations from the mean becomes a
    # synthetic "failure" (BACKUP_SUCCESS = 0).
    if (!is.na(duration_stats$sd_duration) && duration_stats$sd_duration > 0) {
      processed_data <- processed_data %>%
        mutate(
          DURATION_Z = (BACKUP_DURATION - duration_stats$mean_duration) / duration_stats$sd_duration,
          BACKUP_SUCCESS = ifelse(abs(DURATION_Z) > 3, 0, 1)
        )
    } else {
      message("Insufficient duration variation for anomaly detection - all marked as successful")
    }
  }
  return(processed_data)
}
# -----------------------------------------------------------------------------------
# Aggregate raw CPU samples to per-database, per-day averages.
#
# Args:
#   cpu_data: data.frame with DB_NAME, CPU_COUNT, TOTAL_CPU_USAGE, CHECK_DATE
#             (or NULL / empty / malformed, all handled gracefully).
# Returns:
#   One row per (DB_NAME, CHECK_DATE) with AVG_CPU, MAX_CPU, CPU_COUNT; a
#   correctly-typed empty frame when input is unusable (with a warning).
#
# IMPROVEMENT: the identical empty-frame literal was duplicated three times in
# the original; it is now built by a single local helper.
# -----------------------------------------------------------------------------------
process_system_metrics <- function(cpu_data) {
  # Canonical empty result, shared by every degenerate-input path.
  empty_metrics <- function() {
    data.frame(
      DB_NAME = character(),
      CHECK_DATE = as.Date(character()),
      AVG_CPU = numeric(),
      MAX_CPU = numeric(),
      CPU_COUNT = numeric()
    )
  }
  if (is.null(cpu_data)) {
    warning("CPU data is NULL - using empty dataframe")
    return(empty_metrics())
  }
  if (nrow(cpu_data) == 0) {
    warning("CPU data is empty - using empty dataframe")
    return(empty_metrics())
  }
  required_cols <- c("DB_NAME", "CPU_COUNT", "TOTAL_CPU_USAGE", "CHECK_DATE")
  missing_cols <- setdiff(required_cols, colnames(cpu_data))
  if (length(missing_cols) > 0) {
    warning("Missing columns in CPU data: ", paste(missing_cols, collapse = ", "))
    return(empty_metrics())
  }
  cpu_avg <- cpu_data %>%
    mutate(
      # Coerce each column defensively; unconvertible values become NA and are
      # filtered out below.
      CHECK_DATE = tryCatch(
        as.Date(CHECK_DATE),
        error = function(e) {
          warning("Failed to parse CHECK_DATE in CPU data: ", e$message)
          NA
        }
      ),
      TOTAL_CPU_USAGE = tryCatch(
        as.numeric(TOTAL_CPU_USAGE),
        error = function(e) {
          warning("Failed to convert TOTAL_CPU_USAGE to numeric: ", e$message)
          NA
        }
      ),
      CPU_COUNT = tryCatch(
        as.numeric(CPU_COUNT),
        error = function(e) {
          warning("Failed to convert CPU_COUNT to numeric: ", e$message)
          NA
        }
      )
    ) %>%
    filter(!is.na(CHECK_DATE), !is.na(TOTAL_CPU_USAGE), !is.na(CPU_COUNT)) %>%
    group_by(DB_NAME, CHECK_DATE) %>%
    summarise(
      AVG_CPU = mean(TOTAL_CPU_USAGE, na.rm = TRUE),
      MAX_CPU = max(TOTAL_CPU_USAGE, na.rm = TRUE),
      # CPU_COUNT is assumed constant per database/day; first() keeps one value.
      CPU_COUNT = first(CPU_COUNT),
      .groups = "drop"
    )
  return(cpu_avg)
}
# -----------------------------------------------------------------------------------
# Join backup records with CPU metrics and pick the modeling target.
#
# Joins on (DB_NAME, CHECK_DATE), median-imputes missing CPU columns, then
# chooses TARGET: BACKUP_SUCCESS if any failures exist, else BACKUP_DURATION.
#
# Args:
#   backup_data: preprocessed backup records (see preprocess_backup_data).
#   cpu_avg    : per-database/day CPU aggregates (see process_system_metrics).
# Returns:
#   analysis_data with a TARGET column; stops when no target can be derived.
# -----------------------------------------------------------------------------------
prepare_analysis_data <- function(backup_data, cpu_avg) {
  if (nrow(backup_data) == 0) {
    stop("No backup data available for analysis")
  }
  analysis_data <- backup_data %>%
    left_join(cpu_avg, by = c("DB_NAME", "CHECK_DATE")) %>%
    mutate(
      # Median imputation for rows without a matching CPU sample.
      # NOTE(review): if cpu_avg is empty, every value is NA and the median is
      # NaN — the imputed columns would then be all-NaN; verify upstream.
      AVG_CPU = ifelse(is.na(AVG_CPU), median(AVG_CPU, na.rm = TRUE), AVG_CPU),
      CPU_COUNT = ifelse(is.na(CPU_COUNT), median(CPU_COUNT, na.rm = TRUE), CPU_COUNT)
    )
  # Determine target variable: prefer explicit failures (classification),
  # fall back to durations (regression).
  has_failures <- any(backup_data$BACKUP_SUCCESS == 0, na.rm = TRUE)
  has_durations <- any(!is.na(backup_data$BACKUP_DURATION))
  if (has_failures) {
    message("Using backup success/failure as target variable")
    analysis_data <- analysis_data %>%
      mutate(TARGET = BACKUP_SUCCESS) %>%
      filter(!is.na(TARGET))
  } else if (has_durations) {
    message("No failures detected - using backup duration as target variable")
    analysis_data <- analysis_data %>%
      mutate(TARGET = BACKUP_DURATION) %>%
      filter(!is.na(TARGET))
  } else {
    stop("No valid target variable available - need either failures or durations")
  }
  if (nrow(analysis_data) == 0) {
    stop("No valid records remaining after target variable preparation")
  }
  return(analysis_data)
}
# -----------------------------------------------------------------------------------
# Train an xgboost model on the prepared features.
#
# Auto-detects the task: a TARGET of only 0/1 values trains a binary
# classifier (logistic), anything else trains a duration regressor.
#
# Args:
#   train_data: data.frame with TARGET plus BACKUP_HOUR, BACKUP_DAY, AVG_CPU,
#               CPU_COUNT.
# Returns:
#   list(model, is_classification, features); stops on empty data, missing
#   features, or training failure.
#
# IMPROVEMENT: the classification check all(TARGET %in% c(0, 1)) was computed
# twice (branch and return value) in the original; now computed once.
# -----------------------------------------------------------------------------------
train_model <- function(train_data) {
  if (nrow(train_data) == 0) {
    stop("No training data available")
  }
  features <- c("BACKUP_HOUR", "BACKUP_DAY", "AVG_CPU", "CPU_COUNT")
  missing_features <- setdiff(features, colnames(train_data))
  if (length(missing_features) > 0) {
    stop("Missing required features: ", paste(missing_features, collapse = ", "))
  }
  # "- 1" drops the intercept: xgboost wants a plain feature matrix.
  formula <- as.formula(paste("~", paste(features, collapse = " + "), "- 1"))
  train_matrix <- tryCatch(
    model.matrix(formula, data = train_data),
    error = function(e) {
      stop("Failed to create model matrix: ", e$message)
    }
  )
  # Decide the task once and reuse the flag everywhere.
  is_classification <- all(train_data$TARGET %in% c(0, 1))
  if (is_classification) {
    message("Training classification model")
    params <- list(
      objective = "binary:logistic",
      eval_metric = "logloss",
      max_depth = 6,
      eta = 0.1
    )
  } else {
    message("Training regression model for backup duration")
    params <- list(
      objective = "reg:squarederror",
      eval_metric = "rmse",
      max_depth = 6,
      eta = 0.1
    )
  }
  model <- tryCatch(
    xgboost(
      data = train_matrix,
      label = train_data$TARGET,
      params = params,
      nrounds = 100,
      verbose = 0
    ),
    error = function(e) {
      stop("Model training failed: ", e$message)
    }
  )
  return(list(
    model = model,
    is_classification = is_classification,
    features = features
  ))
}
# -----------------------------------------------------------------------------------
# Evaluate a trained model on held-out data.
#
# Args:
#   model_result: list from train_model() (model, is_classification, features).
#   test_data   : data.frame with TARGET and the model's feature columns.
# Returns:
#   Classification: list(type, metrics = caret confusionMatrix, predictions).
#   Regression    : list(type, metrics = list(rmse, mae, r_squared), predictions).
#   NULL (with a warning) on any evaluation error.
#
# BUG FIX: the original built factors with as.factor() and then forced
# levels(x) <- c("0", "1"). When predictions (or truth) contained only one
# class, that renamed the lone level to "0", silently flipping labels and
# corrupting the confusion matrix. Levels are now fixed at construction.
# -----------------------------------------------------------------------------------
evaluate_model <- function(model_result, test_data) {
  tryCatch({
    features <- model_result$features
    missing_features <- setdiff(features, colnames(test_data))
    if (length(missing_features) > 0) {
      stop("Missing features in test data: ", paste(missing_features, collapse = ", "))
    }
    formula <- as.formula(paste("~", paste(features, collapse = " + "), "- 1"))
    test_matrix <- model.matrix(formula, data = test_data)
    predictions <- predict(model_result$model, test_matrix)
    if (model_result$is_classification) {
      # Classification metrics: threshold probabilities at 0.5; explicit
      # levels keep both classes present even when one is never predicted.
      pred_classes <- factor(ifelse(predictions > 0.5, 1, 0), levels = c(0, 1))
      true_labels <- factor(test_data$TARGET, levels = c(0, 1))
      conf_matrix <- caret::confusionMatrix(pred_classes, true_labels, positive = "1")
      return(list(
        type = "classification",
        metrics = conf_matrix,
        predictions = data.frame(
          Actual = test_data$TARGET,
          Predicted = predictions,
          Class = pred_classes
        )
      ))
    } else {
      # Regression metrics.
      actuals <- test_data$TARGET
      rmse <- sqrt(mean((actuals - predictions)^2))
      mae <- mean(abs(actuals - predictions))
      r_squared <- cor(actuals, predictions)^2
      return(list(
        type = "regression",
        metrics = list(rmse = rmse, mae = mae, r_squared = r_squared),
        predictions = data.frame(
          Actual = actuals,
          Predicted = predictions
        )
      ))
    }
  }, error = function(e) {
    warning("Model evaluation failed: ", e$message)
    return(NULL)
  })
}
# -----------------------------------------------------------------------------------
# Build the ggplot objects used in the report.
#
# Args:
#   backup_data  : preprocessed backup records.
#   analysis_data: joined analysis frame with TARGET.
#   model_eval   : result of evaluate_model(), possibly NULL.
# Returns:
#   Named list of plots (success_plot, duration_plot, cpu_plot, conf_plot or
#   pred_plot); entries are omitted when their inputs are unavailable.
#
# BUG FIX: the CPU-plot branch dereferenced model_eval$type without checking
# for NULL (evaluate_model returns NULL on failure), which crashed with
# "argument is of length zero". A NULL model_eval now falls through to the
# scatter variant and skips the model-performance plot, as the later section
# already did.
# -----------------------------------------------------------------------------------
create_visualizations <- function(backup_data, analysis_data, model_eval) {
  plots <- list()
  # Safe task flag: FALSE when evaluation failed (model_eval is NULL).
  is_classification <- !is.null(model_eval) && identical(model_eval$type, "classification")
  # Backup success distribution
  if ("BACKUP_SUCCESS" %in% colnames(backup_data)) {
    plots$success_plot <- ggplot(backup_data,
                                 aes(x = factor(BACKUP_SUCCESS), fill = factor(BACKUP_SUCCESS))) +
      geom_bar() +
      labs(title = "Backup Success Distribution", x = "Backup Status", y = "Count") +
      scale_fill_manual(values = c("1" = "green4", "0" = "red2"),
                        labels = c("Failure", "Success")) +
      theme_minimal()
  }
  # Backup duration by hour of day
  if ("BACKUP_HOUR" %in% colnames(backup_data) &&
      "BACKUP_DURATION" %in% colnames(backup_data)) {
    plots$duration_plot <- ggplot(backup_data, aes(x = BACKUP_HOUR, y = BACKUP_DURATION)) +
      geom_boxplot(aes(group = BACKUP_HOUR), fill = "lightblue") +
      labs(title = "Backup Duration by Hour of Day",
           x = "Hour of Day", y = "Duration (minutes)") +
      theme_minimal()
  }
  # CPU usage vs the target (density by class, or scatter vs duration)
  if ("AVG_CPU" %in% colnames(analysis_data) && "TARGET" %in% colnames(analysis_data)) {
    if (is_classification) {
      plots$cpu_plot <- ggplot(analysis_data, aes(x = AVG_CPU, fill = factor(TARGET))) +
        geom_density(alpha = 0.5) +
        labs(title = "CPU Usage by Backup Status",
             x = "Average CPU Usage", y = "Density") +
        scale_fill_manual(values = c("1" = "green4", "0" = "red2"),
                          labels = c("Failure", "Success")) +
        theme_minimal()
    } else {
      plots$cpu_plot <- ggplot(analysis_data, aes(x = AVG_CPU, y = TARGET)) +
        geom_point(alpha = 0.5) +
        geom_smooth(method = "lm", color = "blue") +
        labs(title = "CPU Usage vs Backup Duration",
             x = "Average CPU Usage", y = "Duration (minutes)") +
        theme_minimal()
    }
  }
  # Model performance visualization (skipped entirely when evaluation failed)
  if (!is.null(model_eval)) {
    if (is_classification) {
      # Confusion matrix heat map
      conf_data <- as.data.frame(model_eval$metrics$table)
      plots$conf_plot <- ggplot(conf_data, aes(x = Reference, y = Prediction, fill = Freq)) +
        geom_tile() +
        geom_text(aes(label = Freq), color = "white") +
        scale_fill_gradient(low = "blue", high = "red") +
        labs(title = "Confusion Matrix", x = "Actual", y = "Predicted") +
        theme_minimal()
    } else {
      # Regression: predicted vs actual with the identity line
      plots$pred_plot <- ggplot(model_eval$predictions, aes(x = Actual, y = Predicted)) +
        geom_point(alpha = 0.5) +
        geom_abline(slope = 1, intercept = 0, color = "red") +
        labs(title = "Actual vs Predicted Backup Duration",
             subtitle = paste("RMSE:", round(model_eval$metrics$rmse, 2),
                              "R-squared:", round(model_eval$metrics$r_squared, 2))) +
        theme_minimal()
    }
  }
  return(plots)
}
# -----------------------------------------------------------------------------------
# Render the analysis report via a temporary R Markdown document.
#
# Writes an Rmd skeleton to a tempfile, renders it with rmarkdown::render(),
# and removes the tempfile. The Rmd chunks reference backup_data, plots and
# model_eval by name; render()'s default envir is this function's frame, so
# the chunk code resolves them against the arguments passed in here.
#
# Args:
#   backup_data, analysis_data, model_eval, plots: pipeline outputs.
#   filename     : full output path (from generate_report_filename).
#   output_format: "pdf" or "html"; selects the YAML output driver.
# -----------------------------------------------------------------------------------
generate_report <- function(backup_data, analysis_data, model_eval, plots, filename, output_format) {
  # Create a temporary Rmd file
  temp_rmd <- tempfile(fileext = ".Rmd")
  # Determine output format and set YAML accordingly
  if (output_format == "pdf") {
    yaml_output <- "pdf_document"
  } else {
    yaml_output <- "html_document"
  }
  # Write the Rmd content. NOTE: each element below is a literal line of the
  # generated document; the "\\n" escapes become "\n" inside the Rmd chunks.
  writeLines(c(
    "---",
    paste0("title: 'Database Backup Analysis Report'"),
    "author: 'Automated Analysis'",
    paste0("date: '", format(Sys.time(), "%B %d, %Y"), "'"),
    paste0("output: ", yaml_output),
    "---",
    "",
    "```{r setup, include=FALSE}",
    "knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)",
    "```",
    "",
    "# Database Backup Analysis Report",
    "",
    "## Data Overview",
    "",
    "```{r data-overview}",
    "cat(paste('Total backup records analyzed:', nrow(backup_data)))",
    "if ('BACKUP_SUCCESS' %in% colnames(backup_data)) {",
    " cat(paste('\\nSuccess rate:', round(mean(backup_data$BACKUP_SUCCESS) * 100, 1), '%'))",
    "}",
    "if ('BACKUP_DURATION' %in% colnames(backup_data)) {",
    " cat(paste('\\nAverage backup duration:', round(mean(backup_data$BACKUP_DURATION, na.rm = TRUE), 1), 'minutes'))",
    "}",
    "```",
    "",
    "## Visualizations",
    "",
    "```{r plots}",
    "if (!is.null(plots$success_plot)) print(plots$success_plot)",
    "if (!is.null(plots$duration_plot)) print(plots$duration_plot)",
    "if (!is.null(plots$cpu_plot)) print(plots$cpu_plot)",
    "```",
    "",
    "## Model Performance",
    "",
    "```{r model-performance}",
    "if (!is.null(model_eval)) {",
    " if (model_eval$type == 'classification') {",
    " cat('### Classification Performance\\n\\n')",
    " print(model_eval$metrics$table)",
    " cat(paste('\\n\\nAccuracy:', round(model_eval$metrics$overall['Accuracy'], 4)))",
    " if (!is.null(plots$conf_plot)) print(plots$conf_plot)",
    " } else {",
    " cat('### Regression Performance\\n\\n')",
    " cat(paste('RMSE:', round(model_eval$metrics$rmse, 2), 'minutes\\n'))",
    " cat(paste('MAE:', round(model_eval$metrics$mae, 2), 'minutes\\n'))",
    " cat(paste('R-squared:', round(model_eval$metrics$r_squared, 2), '\\n\\n'))",
    " if (!is.null(plots$pred_plot)) print(plots$pred_plot)",
    " }",
    "}",
    "```",
    "",
    "## Recommendations",
    "",
    "1. **Optimal Backup Timing**: Based on the analysis, consider scheduling backups during hours with historically higher success rates and shorter durations.",
    "2. **Resource Allocation**: Monitor CPU usage during backup windows and consider allocating additional resources if correlations are found.",
    "3. **Anomaly Detection**: Implement automated alerts for backup durations that fall outside expected ranges.",
    "",
    paste0("### Generated on ", format(Sys.time(), "%B %d, %Y at %H:%M:%S"))
  ), temp_rmd)
  # Render the report
  rmarkdown::render(
    input = temp_rmd,
    output_file = filename,
    quiet = TRUE
  )
  # Clean up the temporary file
  unlink(temp_rmd)
}
# -----------------------------------------------------------------------------------
# Main execution function: runs the full pipeline end to end.
#
# connect -> query backups & CPU -> preprocess -> join/target -> split ->
# train -> evaluate -> plot -> render report. Stops with an error (after
# logging) on any failure.
#
# IMPROVEMENT: connection cleanup was duplicated in the success path and the
# error handler (and relied on exists("conn")); on.exit now guarantees the
# connection is released on every exit path, including errors.
# -----------------------------------------------------------------------------------
main <- function() {
  conn <- NULL
  # Release the connection no matter how this function exits.
  on.exit({
    if (!is.null(conn) && dbIsValid(conn)) {
      dbDisconnect(conn)
      message("Database connection closed")
    }
  }, add = TRUE)
  tryCatch({
    # Check report format capability (pdf when Pandoc is present, else html).
    output_format <- check_pandoc()
    # Generate appropriate report filename.
    report_filename <- generate_report_filename(output_format)
    # Connect to database.
    conn <- connect_to_database()
    # Query backup data.
    backup_query <- "SELECT DB_NAME, BACKUP_TYPE, COMPLETION_TIME, CHECK_DATE FROM ATTSTATS.BACKUP_STATUS"
    backup_data <- query_data(conn, backup_query)
    # Query CPU data.
    cpu_query <- "SELECT DB_NAME, CPU_COUNT, TOTAL_CPU_USAGE, CHECK_DATE FROM ATTSTATS.CPU_USAGE"
    cpu_data <- query_data(conn, cpu_query)
    # Preprocess data.
    backup_data <- preprocess_backup_data(backup_data)
    cpu_avg <- process_system_metrics(cpu_data)
    # Prepare analysis data.
    backup_analysis <- prepare_analysis_data(backup_data, cpu_avg)
    # Split data for modeling (fixed seed for reproducibility).
    set.seed(42)
    train_index <- createDataPartition(backup_analysis$TARGET, p = 0.8, list = FALSE)
    train_data <- backup_analysis[train_index, ]
    test_data <- backup_analysis[-train_index, ]
    # Train and evaluate model.
    model_result <- train_model(train_data)
    model_eval <- evaluate_model(model_result, test_data)
    # Create visualizations.
    plots <- create_visualizations(backup_data, backup_analysis, model_eval)
    # Generate report with the appropriate format.
    generate_report(backup_data, backup_analysis, model_eval, plots,
                    report_filename, output_format)
    message("\nReport generated successfully: ", report_filename)
  }, error = function(e) {
    message("Error in main execution: ", e$message)
    stop("Script execution failed")
  })
}
# -----------------------------------------------------------------------------------
# Execute the main function
# -----------------------------------------------------------------------------------
# Script entry point: runs the whole pipeline when the file is source()d.
main()
# -----------------------------------------------------------------------------------
# End of Script
# -----------------------------------------------------------------------------------
# Only reached when main() returned normally; on error main() stops the script
# before this line.
message("Script execution completed successfully.")
# -----------------------------------------------------------------------------------
montbleau.ca Academic press
address
110 Rue Reid Vaudreuil-Dorion, Qc, Canada J7V 0G1
Mon-Sun 9 am - 7 pm
contact us
montbleau@hotmail.com
5149497697
follow us
Copyright © 2024. All rights reserved. Powered by Domain.com.

We use cookies to enable essential functionality on our website, and analyze website traffic. By clicking Accept you consent to our use of cookies. Read about how we use cookies.

Your Cookie Settings

We use cookies to enable essential functionality on our website, and analyze website traffic. Read about how we use cookies.

Cookie Categories
Essential

These cookies are strictly necessary to provide you with services available through our websites. You cannot refuse these cookies without impacting how our websites function. You can block or delete them by changing your browser settings, as described under the heading "Managing cookies" in the Privacy and Cookies Policy.

Analytics

These cookies collect information that is used in aggregate form to help us understand how our websites are being used or how effective our marketing campaigns are.