# 7_1-predict_all_export_timing.r
# -----------------------------------------------------------------------------------
# File Name    : 7_1-Predict_All_export_timing.r
# Author       : Pierre Montbleau
# Description  : Oracle Database Export Time Prediction for All Schemas
# Purpose      : Analyze all database schemas, predict export times with 16 parallel
#                processes, generate PDF reports and Data Pump command templates
# Call Syntax  : source("F:\\DBA\\Scripts\\R\\7_1-Predict_All_export_timing.r")
# Parameters   : model_type (default="basic_16par") - Prediction model to use
#                Output directory configured internally (F:/DBA/Scripts/R/Reports)
# Last Modified: 2024-01-15
# -----------------------------------------------------------------------------------
# Load required libraries
library(DBI)        # generic database interface
library(odbc)       # ODBC driver backend for DBI
library(dplyr)
library(ggplot2)
library(caret)
library(lubridate)  # For time formatting
library(gridExtra)
library(grid)
library(knitr)
library(rmarkdown)
library(scales)
# -----------------------------------------------------------------------------------
# Database connection
# -----------------------------------------------------------------------------------
# Open the ODBC connection used for the whole analysis run.
# The bracketed placeholders must be replaced with real credentials before sourcing.
conn <- dbConnect(
  odbc::odbc(),
  dsn = "[DB_NAME]",
  uid = "[USERNAME]",
  pwd = "[PASSWORD]"
)
# -----------------------------------------------------------------------------------
# Function to get ALL schemas from the database
# -----------------------------------------------------------------------------------
# Returns a character vector of non-system schema names, trying DBA_, ALL_ and
# USER_ dictionary views in decreasing order of privilege. Stops with an error
# if no view is accessible.
get_all_schemas <- function(conn) {
  # Try different privilege levels, most privileged first
  view_attempts <- list(
    list(name = "DBA",  prefix = "dba_",  table = "users"),
    list(name = "ALL",  prefix = "all_",  table = "users"),
    list(name = "USER", prefix = "user_", table = "users")
  )

  for (attempt in view_attempts) {
    query <- paste0("
      SELECT username as schema_name
      FROM ", attempt$prefix, attempt$table, "
      WHERE username NOT IN (
        'ANONYMOUS', 'APEX_PUBLIC_USER', 'APPQOSSYS', 'AUDSYS', 'CTXSYS',
        'DBSNMP', 'DIP', 'DVF', 'DVSYS', 'EXFSYS', 'FLOWS_FILES',
        'GSMADMIN_INTERNAL', 'GSMCATUSER', 'GSMUSER', 'LBACSYS', 'MDDATA',
        'MDSYS', 'MGMT_VIEW', 'OLAPSYS', 'ORACLE_OCM', 'ORDDATA',
        'ORDPLUGINS', 'ORDSYS', 'OUTLN', 'OWBSYS', 'REMOTE_SCHEDULER_AGENT',
        'SI_INFORMTN_SCHEMA', 'SPATIAL_CSW_ADMIN_USR', 'SPATIAL_WFS_ADMIN_USR',
        'SYS', 'SYSTEM', 'WMSYS', 'XDB', 'XS$NULL'
      )
      ORDER BY username")

    tryCatch({
      schemas <- dbGetQuery(conn, query)
      # BUGFIX: ODBC drivers differ in result-column case; normalize to upper
      # so schemas$SCHEMA_NAME works regardless of driver.
      names(schemas) <- toupper(names(schemas))
      message("Successfully retrieved schemas using ", attempt$name, "_ views")
      return(schemas$SCHEMA_NAME)
    }, error = function(e) {
      message("Attempt with ", attempt$name, "_ views failed: ", conditionMessage(e))
    })
  }

  stop("All attempts to retrieve schemas failed.")
}
# -----------------------------------------------------------------------------------
# Function to get schema statistics with robust error handling
# -----------------------------------------------------------------------------------
# Returns a data frame (object_type, object_count, size_mb) for one schema,
# trying DBA_, ALL_ and USER_ views in turn; NULL if every attempt fails.
get_schema_stats <- function(conn, schema_name) {
  # SECURITY: schema_name is concatenated into SQL below. Restrict it to
  # Oracle identifier characters so an arbitrary string cannot inject SQL.
  if (!grepl("^[A-Za-z0-9_$#]+$", schema_name)) {
    stop("Invalid schema name: ", schema_name)
  }

  # Try different privilege levels in order of preference
  view_attempts <- list(
    list(name = "DBA",  prefix = "dba_"),
    list(name = "ALL",  prefix = "all_"),
    list(name = "USER", prefix = "user_")
  )

  for (attempt in view_attempts) {
    # NOTE(review): the USER_ fallback references o.owner, which USER_OBJECTS
    # does not expose — that attempt will fail and fall through; confirm
    # whether a USER_-specific query is needed on low-privilege accounts.
    query <- paste0("
      SELECT object_type,
             COUNT(*) as object_count,
             NVL(SUM(bytes)/1024/1024, 0) as size_mb
      FROM ", attempt$prefix, "objects o
      LEFT JOIN ", attempt$prefix, "segments s
        ON (o.owner = s.owner
            AND o.object_name = s.segment_name
            AND o.object_type = s.segment_type)
      WHERE o.owner = '", toupper(schema_name), "'
      GROUP BY object_type
      ORDER BY size_mb DESC NULLS LAST")

    tryCatch({
      stats <- dbGetQuery(conn, query)
      # Normalize column names to lowercase (driver-dependent case)
      names(stats) <- tolower(names(stats))
      return(stats)
    }, error = function(e) {
      message("Attempt with ", attempt$name, "_ views for schema ",
              schema_name, " failed: ", conditionMessage(e))
    })
  }

  warning("All attempts to retrieve statistics for schema '", schema_name, "' failed.")
  return(NULL)
}
# -----------------------------------------------------------------------------------
# Function to convert minutes to HH:MM:SS format
# -----------------------------------------------------------------------------------
# total_minutes: scalar numeric number of minutes (may be NA/NULL).
# Returns "HH:MM:SS"; NULL, NA and 0 all map to "00:00:00".
format_time_hhmmss <- function(total_minutes) {
  # BUGFIX: guard NULL / zero-length input — `is.na(NULL) || ...` errors in
  # modern R because `||` rejects zero-length operands.
  if (is.null(total_minutes) || length(total_minutes) == 0 ||
      is.na(total_minutes) || total_minutes == 0) {
    return("00:00:00")
  }
  total_seconds <- round(total_minutes * 60)
  hours   <- floor(total_seconds / 3600)
  minutes <- floor((total_seconds %% 3600) / 60)
  seconds <- total_seconds %% 60
  sprintf("%02d:%02d:%02d", hours, minutes, seconds)
}
# -----------------------------------------------------------------------------------
# Function to estimate export time with 16 parallel processes
# -----------------------------------------------------------------------------------
# schema_stats: data frame from get_schema_stats() (columns object_count, size_mb),
#               or NULL when statistics could not be retrieved.
# model_type  : "basic_16par" (heuristic model) — "regression" reserved.
# Returns a list of estimation fields (minutes, formatted time, complexity, ...).
estimate_export_time_16par <- function(schema_stats, model_type = "basic_16par") {
  # Missing/empty statistics -> FAILED stub so callers can still aggregate
  if (is.null(schema_stats) || nrow(schema_stats) == 0) {
    return(list(
      estimated_minutes = 0,
      formatted_time = "00:00:00",
      size_mb = 0,
      object_count = 0,
      model_used = "No data available",
      warning = "No schema statistics were returned",
      parallel_processes = 16,
      parallel_efficiency = 0.75,
      base_time_minutes = 0,
      complexity = "FAILED",
      estimated_dump_size_gb = 0
    ))
  }

  if (model_type == "basic_16par") {
    # Enhanced heuristic model for 16 parallel processes
    total_size_mb <- sum(schema_stats$size_mb, na.rm = TRUE)
    total_objects <- sum(schema_stats$object_count, na.rm = TRUE)

    # Base factors (tuned for parallel processing)
    size_factor         <- 0.5   # minutes per GB
    object_factor       <- 0.008 # minutes per 100 objects
    parallel_efficiency <- 0.75  # 75% efficiency assumed for 16 processes

    # Sequential ("base") time, then divided by effective parallelism
    size_time   <- (total_size_mb / 1024) * size_factor
    object_time <- (total_objects / 100) * object_factor
    base_time   <- size_time + object_time
    parallel_time <- base_time / (16 * parallel_efficiency)

    # Floor at 30 seconds: tiny schemas still pay fixed Data Pump overhead
    parallel_time <- max(parallel_time, 0.5)

    # Complexity classification (minutes of parallel time)
    if (parallel_time < 1) {
      complexity <- "VERY SIMPLE"
    } else if (parallel_time < 5) {
      complexity <- "SIMPLE"
    } else if (parallel_time < 15) {
      complexity <- "MODERATE"
    } else if (parallel_time < 30) {
      complexity <- "COMPLEX"
    } else {
      complexity <- "VERY COMPLEX"
    }

    return(list(
      estimated_minutes = round(parallel_time, 2),
      formatted_time = format_time_hhmmss(parallel_time),
      size_mb = round(total_size_mb, 1),
      object_count = total_objects,
      model_used = "16 Parallel Processes Model",
      parallel_processes = 16,
      parallel_efficiency = parallel_efficiency,
      base_time_minutes = round(base_time, 2),
      complexity = complexity,
      estimated_dump_size_gb = round(total_size_mb / 1024 * 0.7, 3) # 70% compression
    ))
  } else if (model_type == "regression") {
    stop("Regression model not implemented in this example")
  } else {
    # BUGFIX: unknown model types previously fell through and returned NULL
    stop("Unknown model_type: ", model_type)
  }
}
# -----------------------------------------------------------------------------------
# Function to generate data pump command for individual schema with 16 parallel
# -----------------------------------------------------------------------------------
# schema_name: schema to export; estimation: list from estimate_export_time_16par();
# db_name: kept for interface compatibility (not used in the command text).
# Returns list(command, filesize_gb, num_files, actual_parallel).
generate_schema_datapump_16par <- function(schema_name, estimation, db_name = "ORCL") {
  timestamp <- format(Sys.time(), "%Y%m%d_%H%M")

  # Choose a dump-file size appropriate to the schema size
  if (estimation$size_mb > 102400) {        # > 100GB
    filesize_gb <- 5
  } else if (estimation$size_mb > 10240) {  # > 10GB
    filesize_gb <- 2
  } else if (estimation$size_mb > 1024) {   # > 1GB
    filesize_gb <- 1
  } else {
    filesize_gb <- 0.5                      # 500MB
  }

  # Number of files needed (minimum 2 for parallelism, capped at 8 per schema)
  num_files_needed <- ceiling(estimation$estimated_dump_size_gb / filesize_gb)
  num_files_needed <- max(2, min(num_files_needed, 8))

  # BUGFIX: expdp FILESIZE takes an integer plus a unit, so "0.5G" is invalid.
  # Express sub-GB sizes in megabytes instead.
  filesize_spec <- if (filesize_gb < 1) {
    paste0(round(filesize_gb * 1024), "M")
  } else {
    paste0(filesize_gb, "G")
  }

  command <- paste0(
    "expdp system/<password> \\\n",
    "  SCHEMAS=", schema_name, " \\\n",
    "  DIRECTORY=DATA_PUMP_DIR \\\n",
    "  DUMPFILE=", schema_name, "_", timestamp, "_%U.dmp \\\n",
    "  FILESIZE=", filesize_spec, " \\\n",
    "  PARALLEL=", min(16, num_files_needed), " \\\n",
    "  LOGFILE=", schema_name, "_export_", timestamp, ".log \\\n",
    "  COMPRESSION=ALL \\\n",
    "  EXCLUDE=STATISTICS \\\n",
    "  JOB_NAME=", schema_name, "_EXP_", timestamp
  )

  return(list(
    command = command,
    filesize_gb = filesize_gb,
    num_files = num_files_needed,
    actual_parallel = min(16, num_files_needed) # parallelism capped by file count
  ))
}
# -----------------------------------------------------------------------------------
# Function to generate PDF report for ALL schemas with 16 parallel focus
# -----------------------------------------------------------------------------------
# all_results: named list produced by predict_all_schemas_export_time_16par().
# Renders a PDF (falling back to HTML if PDF rendering fails) and returns the
# path to the file actually produced, or NULL when there is nothing to report.
generate_comprehensive_report_16par <- function(all_results,
                                                output_dir = "F:/DBA/Scripts/R/Reports") {
  # Create output directory if it doesn't exist
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }

  # Unique filename with timestamp
  timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
  report_file <- file.path(output_dir,
                           paste0("All_Schemas_Export_16PAR_Estimation_", timestamp, ".pdf"))

  # Temporary Rmd file, always cleaned up on exit
  temp_rmd <- tempfile(fileext = ".Rmd")
  on.exit(unlink(temp_rmd), add = TRUE)

  # Filter out failed schemas (vapply is safe on an empty list, unlike sapply)
  valid_results <- all_results[vapply(all_results,
                                      function(x) !is.null(x$stats), logical(1))]

  if (length(valid_results) == 0) {
    warning("No valid schema results to generate report")
    return(NULL)
  }

  # Flatten the per-schema estimations into a summary table for the report
  summary_data <- data.frame(
    Schema            = names(valid_results),
    Size_MB           = sapply(valid_results, function(x) x$estimation$size_mb),
    Objects           = sapply(valid_results, function(x) x$estimation$object_count),
    Estimated_Minutes = sapply(valid_results, function(x) x$estimation$estimated_minutes),
    Formatted_Time    = sapply(valid_results, function(x) x$estimation$formatted_time),
    Complexity        = sapply(valid_results, function(x) x$estimation$complexity),
    Base_Time_Minutes = sapply(valid_results, function(x) x$estimation$base_time_minutes),
    Est_Dump_Size_GB  = sapply(valid_results, function(x) x$estimation$estimated_dump_size_gb),
    stringsAsFactors  = FALSE
  )

  # Order by estimated time (descending)
  summary_data <- summary_data[order(-summary_data$Estimated_Minutes), ]

  # Totals used throughout the report body
  total_size          <- sum(summary_data$Size_MB, na.rm = TRUE)
  total_objects       <- sum(summary_data$Objects, na.rm = TRUE)
  total_time_parallel <- sum(summary_data$Estimated_Minutes, na.rm = TRUE)
  total_time_base     <- sum(summary_data$Base_Time_Minutes, na.rm = TRUE)

  # Time savings of parallel vs. sequential (guard division by zero)
  time_savings_percent <- if (total_time_base > 0) {
    round((1 - total_time_parallel / total_time_base) * 100, 1)
  } else {
    0
  }

  # Complexity distribution (referenced by an Rmd chunk below; render() uses
  # envir = parent.frame(), so these locals are visible to the chunks)
  complexity_dist <- table(summary_data$Complexity)

  # Write Rmd content
  writeLines(c(
    "---",
    "title: 'Oracle ALL Schemas Export Time Estimation Report (16 Parallel Processes)'",
    "author: 'Automated DBA Report'",
    "date: '`r Sys.Date()`'",
    "output: pdf_document",
    "---",
    "",
    "```{r setup, include=FALSE}",
    "knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)",
    "library(ggplot2)",
    "library(dplyr)",
    "library(scales)",
    "```",
    "",
    "# Oracle ALL Schemas Export Time Estimation - 16 Parallel Processes",
    "",
    "## Report Summary",
    paste0("- **Total Schemas Analyzed:** ", nrow(summary_data)),
    paste0("- **Total Size:** ", round(total_size, 1), " MB (", round(total_size/1024, 1), " GB)"),
    paste0("- **Total Objects:** ", format(total_objects, big.mark = ",")),
    paste0("- **Total Estimated Export Time (16 PAR):** ", format_time_hhmmss(total_time_parallel)),
    paste0("- **Base Time (Sequential):** ", format_time_hhmmss(total_time_base)),
    paste0("- **Time Savings:** ", time_savings_percent, "% faster with 16 parallel processes"),
    paste0("- **Parallel Efficiency Factor:** 75%"),
    paste0("- **Report Generated:** ", Sys.time()),
    "",
    "## Performance Overview",
    "",
    "### Complexity Distribution",
    "```{r complexity-dist}",
    "complexity_df <- as.data.frame(complexity_dist)",
    "names(complexity_df) <- c('Complexity', 'Count')",
    "knitr::kable(complexity_df, caption = 'Schema Complexity Distribution')",
    "```",
    "",
    "### Time Savings Analysis",
    "```{r time-savings, fig.height=6, fig.width=10}",
    "savings_data <- data.frame(",
    "  Method = c('Sequential', '16 Parallel'),",
    "  Time_Hours = c(total_time_base/60, total_time_parallel/60)",
    ")",
    "",
    "ggplot(savings_data, aes(x = Method, y = Time_Hours, fill = Method)) +",
    "  geom_bar(stat = 'identity') +",
    "  labs(title = 'Total Export Time Comparison',",
    "       x = 'Export Method',",
    "       y = 'Time (Hours)') +",
    "  theme_minimal() +",
    "  geom_text(aes(label = paste0(round(Time_Hours, 1), ' hours')), vjust = -0.5) +",
    "  scale_fill_manual(values = c('Sequential' = 'red', '16 Parallel' = 'green'))",
    "```",
    "",
    "## Export Time Summary for All Schemas (16 Parallel)",
    "",
    "```{r summary-table}",
    "display_data <- summary_data %>%",
    "  select(Schema, Size_MB, Objects, Estimated_Minutes, Formatted_Time, Complexity)",
    "knitr::kable(display_data, caption = 'Export Time Estimation with 16 Parallel Processes')",
    "```",
    "",
    "## Visualizations",
    "",
    "### Top 10 Schemas by Estimated Export Time (16 PAR)",
    "```{r top10-plot, fig.height=6, fig.width=10}",
    "top_10 <- summary_data %>%",
    "  arrange(desc(Estimated_Minutes)) %>%",
    "  head(10)",
    "",
    "ggplot(top_10, aes(x = reorder(Schema, Estimated_Minutes), y = Estimated_Minutes, fill = Complexity)) +",
    "  geom_bar(stat = 'identity') +",
    "  coord_flip() +",
    "  labs(title = 'Top 10 Schemas by Estimated Export Time (16 Parallel)',",
    "       x = 'Schema',",
    "       y = 'Estimated Time (minutes)') +",
    "  theme_minimal() +",
    "  theme(legend.position = 'bottom') +",
    "  scale_y_continuous(labels = scales::comma)",
    "```",
    "",
    "### Size vs Objects Scatter Plot with Complexity",
    "```{r scatter-plot, fig.height=6, fig.width=10}",
    "ggplot(summary_data, aes(x = Size_MB, y = Objects, size = Estimated_Minutes, color = Complexity)) +",
    "  geom_point(alpha = 0.7) +",
    "  geom_text(aes(label = ifelse(Estimated_Minutes > 10, Schema, '')), ",
    "            size = 3, hjust = 0.5, vjust = -0.5, check_overlap = TRUE) +",
    "  labs(title = 'Schema Size vs Number of Objects (16 Parallel)',",
    "       x = 'Size (MB)',",
    "       y = 'Number of Objects',",
    "       size = 'Est. Time (min)',",
    "       color = 'Complexity') +",
    "  theme_minimal() +",
    "  scale_x_log10(labels = scales::comma) +",
    "  scale_y_log10(labels = scales::comma)",
    "```",
    "",
    "### Distribution of Estimated Times (16 PAR)",
    "```{r distribution-plot, fig.height=6, fig.width=10}",
    "ggplot(summary_data, aes(x = Estimated_Minutes, fill = Complexity)) +",
    "  geom_histogram(binwidth = 1, alpha = 0.7) +",
    "  labs(title = 'Distribution of Estimated Export Times (16 Parallel)',",
    "       x = 'Estimated Time (minutes)',",
    "       y = 'Number of Schemas') +",
    "  theme_minimal() +",
    "  theme(legend.position = 'bottom') +",
    "  scale_x_continuous(labels = scales::comma)",
    "```",
    "",
    "## Data Pump Command Templates for Top Schemas",
    "",
    "### Recommended Parallel Export Strategy",
    "For optimal performance with 16 parallel processes:",
    "",
    "1. **Group small schemas** (Estimated time < 1 minute) into batches",
    "2. **Export medium schemas** (1-5 minutes) individually with reduced parallelism",
    "3. **Large schemas** (>5 minutes) benefit most from full 16 parallel processes",
    "",
    "```{r datapump-examples, results='asis'}",
    "cat('### Example Data Pump Commands for Different Schema Sizes\\n\\n')",
    "",
    "# Get top 3 largest schemas",
    "top_3 <- head(summary_data, 3)",
    "",
    "for(i in 1:min(3, nrow(summary_data))) {",
    "  schema <- top_3$Schema[i]",
    "  est <- valid_results[[schema]]$estimation",
    "  dp_info <- generate_schema_datapump_16par(schema, est)",
    "  ",
    "  cat('#### ', schema, ' (', est$complexity, ')\\n')",
    "  cat('- **Size:** ', est$size_mb, ' MB\\n')",
    "  cat('- **Estimated Time:** ', est$formatted_time, '\\n')",
    "  cat('- **Recommended Parallelism:** ', dp_info$actual_parallel, '\\n')",
    "  cat('- **File Size:** ', dp_info$filesize_gb, ' GB\\n')",
    "  cat('- **Number of Files:** ', dp_info$num_files, '\\n')",
    "  cat('```bash\\n', dp_info$command, '\\n```\\n\\n')",
    "}",
    "```",
    "",
    "## Batch Export Recommendations",
    "",
    "Based on the analysis, schemas can be grouped for optimal parallel export:",
    "",
    "```{r batch-recommendations}",
    "# Create batch groups",
    "batch_groups <- summary_data %>%",
    "  mutate(Batch_Group = case_when(",
    "    Estimated_Minutes < 1 ~ 'Batch 1 (Very Fast)',",
    "    Estimated_Minutes < 5 ~ 'Batch 2 (Fast)',",
    "    Estimated_Minutes < 15 ~ 'Batch 3 (Medium)',",
    "    Estimated_Minutes < 30 ~ 'Batch 4 (Large)',",
    "    TRUE ~ 'Batch 5 (Very Large)'",
    "  )) %>%",
    "  group_by(Batch_Group) %>%",
    "  summarise(",
    "    Count = n(),",
    "    Total_Size_MB = sum(Size_MB),",
    "    Total_Time_Min = sum(Estimated_Minutes),",
    "    Avg_Time_Min = mean(Estimated_Minutes)",
    "  ) %>%",
    "  arrange(Total_Time_Min)",
    "",
    "knitr::kable(batch_groups, caption = 'Recommended Batch Groups for Parallel Export')",
    "```",
    "",
    "## Performance Considerations for 16 Parallel Exports",
    "",
    "### Advantages:",
    "1. **Maximum throughput** for large schemas",
    "2. **Reduced wall-clock time** for batch exports",
    "3. **Better resource utilization** of modern multi-core servers",
    "",
    "### Requirements:",
    "1. **Sufficient I/O bandwidth** (minimum 200 MB/s per parallel process)",
    "2. **Adequate PGA memory** for parallel query slaves",
    "3. **Fast storage subsystem** (SSD recommended)",
    "4. **Properly configured TEMP tablespace**",
    "",
    "### Monitoring During Export:",
    "```sql",
    "-- Monitor parallel processes",
    "SELECT * FROM v$pq_sesstat;",
    "",
    "-- Monitor active Data Pump jobs",
    "SELECT * FROM dba_datapump_jobs;",
    "",
    "-- Check I/O statistics",
    "SELECT * FROM v$iostat_file;",
    "```",
    "",
    "## Risk Assessment for Parallel Exports",
    "",
    "### High Risk Schemas:",
    "- Schemas with **LOB columns** may not benefit fully from parallelism",
    "- Schemas with **many constraints** may experience serialization",
    "- **Very small schemas** (< 100MB) may have overhead > benefit",
    "",
    "### Recommended Actions:",
    "1. **Pre-export testing** for complex schemas",
    "2. **Monitor PGA usage** during first few exports",
    "3. **Adjust parallelism** based on actual performance",
    "4. **Schedule exports** during off-peak hours",
    "",
    "## Storage Requirements for Parallel Exports",
    "",
    "| Requirement | Calculation | Notes |",
    "|-------------|-------------|-------|",
    paste0("| Dump Files | ", round(sum(summary_data$Est_Dump_Size_GB), 1), " GB | Total compressed size |"),
    paste0("| Temp Space | ", round(total_size/1024 * 0.1, 1), " GB | 10% of total size for sorting |"),
    "| Log Files | 1-3 GB | Per parallel export job |",
    paste0("| **Total Required** | **", round(sum(summary_data$Est_Dump_Size_GB) + total_size/1024 * 0.1 + 3, 1), " GB** | Minimum storage requirement |"),
    "",
    "## Schedule Optimization",
    "",
    paste0("Based on total estimated time of ", format_time_hhmmss(total_time_parallel), ":"),
    "",
    "- **Recommended batch size:** 4-6 schemas concurrently",
    "- **Optimal concurrency level:** 75% of available CPU cores",
    "- **Memory requirement:** 2GB PGA per parallel process",
    "- **Monitoring frequency:** Every 15 minutes during exports",
    "",
    "## Notes",
    "- This report provides estimates for **16 parallel process exports**",
    "- System schemas are excluded from analysis",
    "- Estimated times include **75% parallel efficiency factor**",
    "- Actual export times may vary based on:",
    "  - I/O subsystem performance",
    "  - Database version and configuration",
    "  - System load during export",
    "  - Network bandwidth to target",
    "- **Parallel exports require careful monitoring** to avoid resource contention",
    "- **Test exports recommended** for production-critical schemas",
    "- Report generated by automated DBA script (16 Parallel Edition)"
  ), temp_rmd)

  # Render the report. BUGFIX: the original assigned the HTML fallback path
  # inside the error handler, which only changed a handler-local variable —
  # the function then returned a PDF path that was never created. Taking the
  # tryCatch VALUE makes the returned path the file actually produced.
  report_file <- tryCatch({
    rmarkdown::render(temp_rmd, output_file = report_file, quiet = TRUE)
    message("Comprehensive report saved to: ", report_file)
    report_file
  }, error = function(e) {
    warning("Failed to render PDF report: ", conditionMessage(e))
    # Fall back to HTML output (e.g. when no LaTeX toolchain is available)
    report_file_html <- file.path(output_dir,
                                  paste0("All_Schemas_Export_16PAR_Estimation_", timestamp, ".html"))
    writeLines(gsub("output: pdf_document", "output: html_document",
                    readLines(temp_rmd)), temp_rmd)
    rmarkdown::render(temp_rmd, output_file = report_file_html, quiet = TRUE)
    message("HTML report saved to: ", report_file_html)
    report_file_html
  })

  return(report_file)
}
# -----------------------------------------------------------------------------------
# Main function to predict export time for ALL schemas with 16 parallel
# -----------------------------------------------------------------------------------
# conn: open DBI connection; model_type: forwarded to estimate_export_time_16par().
# Returns list(all_results, report_path, total_schemas, processed_schemas,
# summary_stats).
predict_all_schemas_export_time_16par <- function(conn, model_type = "basic_16par") {
  # Get all schemas
  message("Retrieving all schemas from database...")
  all_schemas <- get_all_schemas(conn)
  message("Found ", length(all_schemas), " schemas to analyze")

  results <- list()
  progress <- 0

  for (schema in all_schemas) {
    progress <- progress + 1
    message(sprintf("[%03d/%03d] Processing schema: %s",
                    progress, length(all_schemas), schema))

    # BUGFIX: assign the VALUE of tryCatch. The original did
    # `results[[schema]] <- ...` inside the error handler, which only modified
    # a copy local to the handler function — failed schemas were silently
    # dropped from the results list.
    results[[schema]] <- tryCatch({
      # Get schema statistics and derive the estimation + expdp template
      schema_stats  <- get_schema_stats(conn, schema)
      estimation    <- estimate_export_time_16par(schema_stats, model_type)
      datapump_info <- generate_schema_datapump_16par(schema, estimation)

      message(sprintf("  - Size: %7.1f MB, Objects: %6d, Time: %s (%s)",
                      estimation$size_mb, estimation$object_count,
                      estimation$formatted_time, estimation$complexity))

      list(
        schema_name = schema,
        estimation = estimation,
        datapump_info = datapump_info,
        stats = schema_stats
      )
    }, error = function(e) {
      warning("Failed to process schema ", schema, ": ", conditionMessage(e))
      # FAILED stub so aggregation below still works
      list(
        schema_name = schema,
        estimation = list(
          estimated_minutes = 0,
          formatted_time = "00:00:00",
          size_mb = 0,
          object_count = 0,
          model_used = "Failed to analyze",
          parallel_processes = 16,
          parallel_efficiency = 0.75,
          base_time_minutes = 0,
          complexity = "FAILED",
          estimated_dump_size_gb = 0,
          warning = conditionMessage(e)
        ),
        datapump_info = NULL,
        stats = NULL
      )
    })

    # Small delay to avoid overwhelming the database
    Sys.sleep(0.05)
  }

  # Generate comprehensive PDF report
  message("\nGenerating comprehensive report for 16 parallel processes...")
  report_path <- generate_comprehensive_report_16par(results)

  # Summary statistics (vapply keeps the result types predictable)
  successful_schemas  <- sum(vapply(results, function(x) !is.null(x$stats), logical(1)))
  total_parallel_time <- sum(vapply(results, function(x) x$estimation$estimated_minutes, numeric(1)))
  total_base_time     <- sum(vapply(results, function(x) x$estimation$base_time_minutes, numeric(1)))

  return(list(
    all_results = results,
    report_path = report_path,
    total_schemas = length(all_schemas),
    processed_schemas = successful_schemas,
    summary_stats = list(
      total_parallel_time_minutes = total_parallel_time,
      total_base_time_minutes = total_base_time,
      time_savings_percent = if (total_base_time > 0) {
        round((1 - total_parallel_time / total_base_time) * 100, 1)
      } else {
        0
      },
      avg_parallel_time_per_schema = if (successful_schemas > 0) {
        round(total_parallel_time / successful_schemas, 2)
      } else {
        0
      }
    )
  ))
}
# -----------------------------------------------------------------------------------
# Execute the analysis for ALL schemas with 16 parallel processes
# -----------------------------------------------------------------------------------
# Print the start-of-run banner.
cat("\n", strrep("=", 80), "\n", sep = "")
cat("STARTING EXPORT TIME ESTIMATION FOR ALL DATABASE SCHEMAS\n")
cat("OPTIMIZED FOR 16 PARALLEL PROCESSES\n")
cat(strrep("=", 80), "\n", sep = "")
# Run the full analysis; on failure report the error and carry on with NULL
# so the connection below is still closed and the script exits cleanly.
all_results <- tryCatch(
  predict_all_schemas_export_time_16par(conn, model_type = "basic_16par"),
  error = function(err) {
    cat("Analysis failed: ", conditionMessage(err), "\n")
    NULL
  }
)
# Close connection when done (runs regardless of analysis success because the
# tryCatch above converts failures into a NULL result rather than an error)
dbDisconnect(conn)
# -----------------------------------------------------------------------------------
# Output results and summary
# -----------------------------------------------------------------------------------
if (!is.null(all_results)) {
  # Print run summary
  cat(paste0("\n", strrep("=", 80), "\n"))
  cat("ANALYSIS COMPLETE - 16 PARALLEL PROCESSES\n")
  cat(paste0(strrep("=", 80), "\n"))
  cat("Total schemas analyzed:", all_results$total_schemas, "\n")
  cat("Successfully processed:", all_results$processed_schemas, "\n")
  cat("Total estimated time (16 PAR):",
      format_time_hhmmss(all_results$summary_stats$total_parallel_time_minutes), "\n")
  cat("Base time (sequential):",
      format_time_hhmmss(all_results$summary_stats$total_base_time_minutes), "\n")
  cat("Time savings:", all_results$summary_stats$time_savings_percent, "% faster\n")
  cat("Average time per schema:",
      round(all_results$summary_stats$avg_parallel_time_per_schema, 2), "minutes\n")
  cat("Comprehensive report saved to:", all_results$report_path, "\n\n")

  # Print top 10 schemas by estimated time
  if (length(all_results$all_results) > 0) {
    # Filter out failed schemas for display (vapply is empty-safe)
    valid_display <- all_results$all_results[
      vapply(all_results$all_results, function(x) !is.null(x$stats), logical(1))]

    if (length(valid_display) > 0) {
      # Flatten each schema's estimation into a character row, then rebuild
      # the numeric columns after rbind (rbind on character vectors coerces
      # everything to character)
      summary_list <- lapply(names(valid_display), function(schema) {
        est <- valid_display[[schema]]$estimation
        c(
          Schema = schema,
          Size_MB = est$size_mb,
          Objects = est$object_count,
          Minutes = est$estimated_minutes,
          Time = est$formatted_time,
          Complexity = est$complexity,
          Parallel_Saving = if (est$base_time_minutes > 0 && est$estimated_minutes > 0) {
            round((1 - est$estimated_minutes / est$base_time_minutes) * 100, 1)
          } else {
            0
          }
        )
      })
      summary_df <- as.data.frame(do.call(rbind, summary_list))
      summary_df$Size_MB         <- as.numeric(summary_df$Size_MB)
      summary_df$Objects         <- as.numeric(summary_df$Objects)
      summary_df$Minutes         <- as.numeric(summary_df$Minutes)
      summary_df$Parallel_Saving <- as.numeric(summary_df$Parallel_Saving)

      # Order by estimated time, largest first
      summary_df <- summary_df[order(-summary_df$Minutes), ]

      cat("TOP 10 SCHEMAS BY ESTIMATED EXPORT TIME (16 PARALLEL):\n")
      cat(paste0(strrep("-", 100), "\n"))
      cat(sprintf("%-25s %10s %10s %10s %12s %15s %12s\n",
                  "SCHEMA", "SIZE_MB", "OBJECTS", "MINUTES", "TIME",
                  "COMPLEXITY", "PAR_SAVING"))
      cat(paste0(strrep("-", 100), "\n"))
      for (i in seq_len(min(10, nrow(summary_df)))) {
        cat(sprintf("%-25s %10.1f %10d %10.2f %12s %15s %11.1f%%\n",
                    summary_df$Schema[i], summary_df$Size_MB[i],
                    summary_df$Objects[i], summary_df$Minutes[i],
                    summary_df$Time[i], summary_df$Complexity[i],
                    summary_df$Parallel_Saving[i]))
      }

      # Print batch recommendations
      cat(paste0("\n", strrep("=", 80), "\n"))
      cat("BATCH EXPORT RECOMMENDATIONS FOR 16 PARALLEL PROCESSES\n")
      cat(paste0(strrep("=", 80), "\n"))

      # Group schemas by estimated time bands
      fast_schemas   <- summary_df[summary_df$Minutes < 1, ]
      medium_schemas <- summary_df[summary_df$Minutes >= 1 & summary_df$Minutes < 5, ]
      large_schemas  <- summary_df[summary_df$Minutes >= 5 & summary_df$Minutes < 15, ]
      xlarge_schemas <- summary_df[summary_df$Minutes >= 15, ]

      cat(sprintf("Fast schemas (<1 min): %d schemas, total time: %.1f min\n",
                  nrow(fast_schemas), sum(fast_schemas$Minutes, na.rm = TRUE)))
      cat("  Recommended: Export in batches of 8-10 schemas concurrently\n\n")
      cat(sprintf("Medium schemas (1-5 min): %d schemas, total time: %.1f min\n",
                  nrow(medium_schemas), sum(medium_schemas$Minutes, na.rm = TRUE)))
      cat("  Recommended: Export in batches of 4-6 schemas concurrently\n\n")
      cat(sprintf("Large schemas (5-15 min): %d schemas, total time: %.1f min\n",
                  nrow(large_schemas), sum(large_schemas$Minutes, na.rm = TRUE)))
      cat("  Recommended: Export individually or 2-3 concurrently\n\n")
      cat(sprintf("Extra Large schemas (>15 min): %d schemas, total time: %.1f min\n",
                  nrow(xlarge_schemas), sum(xlarge_schemas$Minutes, na.rm = TRUE)))
      cat("  Recommended: Export individually with full 16 parallelism\n\n")

      # CONSISTENCY FIX: use na.rm = TRUE here like every other sum above
      cat("Total estimated batch time (optimized):",
          format_time_hhmmss(sum(fast_schemas$Minutes, na.rm = TRUE) / 10 +
                             sum(medium_schemas$Minutes, na.rm = TRUE) / 5 +
                             sum(large_schemas$Minutes, na.rm = TRUE) / 2 +
                             sum(xlarge_schemas$Minutes, na.rm = TRUE)), "\n")
    }
  }

  # Save detailed results to RDS for later inspection
  rds_file <- paste0("all_schemas_16par_analysis_",
                     format(Sys.time(), "%Y%m%d_%H%M"), ".rds")
  saveRDS(all_results, file = rds_file)
  cat("\nDetailed results saved to RDS file:", rds_file, "\n")
}
# -----------------------------------------------------------------------------------
# End of Script
# -----------------------------------------------------------------------------------
cat(paste0("\n", strrep("=", 80), "\n"))
# BUGFIX: cat() does not dispatch format methods, so a bare Sys.time() would
# print the raw seconds-since-epoch number; format() gives a readable stamp.
cat("ANALYSIS COMPLETED AT:", format(Sys.time()), "\n")
cat(paste0(strrep("=", 80), "\n"))
message("All schemas export prediction script execution completed successfully.")
# -----------------------------------------------------------------------------------
# Load required libraries
# NOTE(review): this section duplicates the library block at the top of the
# file; library() is idempotent so re-running it is harmless.
library(DBI)
library(odbc)
library(dplyr)
library(ggplot2)
library(caret)
library(lubridate) # For time formatting
library(gridExtra)
library(grid)
library(knitr)
library(rmarkdown)
library(scales)
# -----------------------------------------------------------------------------------
# Database connection
# -----------------------------------------------------------------------------------
# Re-open the ODBC connection (duplicated section in original source).
# The bracketed placeholders must be replaced with real credentials.
conn <- dbConnect(
  odbc::odbc(),
  dsn = "[DB_NAME]",
  uid = "[USERNAME]",
  pwd = "[PASSWORD]"
)
# -----------------------------------------------------------------------------------
# Function to get ALL schemas from the database
# -----------------------------------------------------------------------------------
# Returns a character vector of non-system schema names, trying DBA_, ALL_ and
# USER_ dictionary views in decreasing order of privilege. Stops with an error
# if no view is accessible.
get_all_schemas <- function(conn) {
  # Try different privilege levels, most privileged first
  view_attempts <- list(
    list(name = "DBA",  prefix = "dba_",  table = "users"),
    list(name = "ALL",  prefix = "all_",  table = "users"),
    list(name = "USER", prefix = "user_", table = "users")
  )

  for (attempt in view_attempts) {
    query <- paste0("
      SELECT username as schema_name
      FROM ", attempt$prefix, attempt$table, "
      WHERE username NOT IN (
        'ANONYMOUS', 'APEX_PUBLIC_USER', 'APPQOSSYS', 'AUDSYS', 'CTXSYS',
        'DBSNMP', 'DIP', 'DVF', 'DVSYS', 'EXFSYS', 'FLOWS_FILES',
        'GSMADMIN_INTERNAL', 'GSMCATUSER', 'GSMUSER', 'LBACSYS', 'MDDATA',
        'MDSYS', 'MGMT_VIEW', 'OLAPSYS', 'ORACLE_OCM', 'ORDDATA',
        'ORDPLUGINS', 'ORDSYS', 'OUTLN', 'OWBSYS', 'REMOTE_SCHEDULER_AGENT',
        'SI_INFORMTN_SCHEMA', 'SPATIAL_CSW_ADMIN_USR', 'SPATIAL_WFS_ADMIN_USR',
        'SYS', 'SYSTEM', 'WMSYS', 'XDB', 'XS$NULL'
      )
      ORDER BY username")

    tryCatch({
      schemas <- dbGetQuery(conn, query)
      # BUGFIX: ODBC drivers differ in result-column case; normalize to upper
      # so schemas$SCHEMA_NAME works regardless of driver.
      names(schemas) <- toupper(names(schemas))
      message("Successfully retrieved schemas using ", attempt$name, "_ views")
      return(schemas$SCHEMA_NAME)
    }, error = function(e) {
      message("Attempt with ", attempt$name, "_ views failed: ", conditionMessage(e))
    })
  }

  stop("All attempts to retrieve schemas failed.")
}
# -----------------------------------------------------------------------------------
# Function to get schema statistics with robust error handling
# -----------------------------------------------------------------------------------
# Returns a data frame (object_type, object_count, size_mb) for one schema,
# trying DBA_, ALL_ and USER_ views in turn; NULL if every attempt fails.
get_schema_stats <- function(conn, schema_name) {
  # SECURITY: schema_name is concatenated into SQL below. Restrict it to
  # Oracle identifier characters so an arbitrary string cannot inject SQL.
  if (!grepl("^[A-Za-z0-9_$#]+$", schema_name)) {
    stop("Invalid schema name: ", schema_name)
  }

  # Try different privilege levels in order of preference
  view_attempts <- list(
    list(name = "DBA",  prefix = "dba_"),
    list(name = "ALL",  prefix = "all_"),
    list(name = "USER", prefix = "user_")
  )

  for (attempt in view_attempts) {
    # NOTE(review): the USER_ fallback references o.owner, which USER_OBJECTS
    # does not expose — that attempt will fail and fall through; confirm
    # whether a USER_-specific query is needed on low-privilege accounts.
    query <- paste0("
      SELECT object_type,
             COUNT(*) as object_count,
             NVL(SUM(bytes)/1024/1024, 0) as size_mb
      FROM ", attempt$prefix, "objects o
      LEFT JOIN ", attempt$prefix, "segments s
        ON (o.owner = s.owner
            AND o.object_name = s.segment_name
            AND o.object_type = s.segment_type)
      WHERE o.owner = '", toupper(schema_name), "'
      GROUP BY object_type
      ORDER BY size_mb DESC NULLS LAST")

    tryCatch({
      stats <- dbGetQuery(conn, query)
      # Normalize column names to lowercase (driver-dependent case)
      names(stats) <- tolower(names(stats))
      return(stats)
    }, error = function(e) {
      message("Attempt with ", attempt$name, "_ views for schema ",
              schema_name, " failed: ", conditionMessage(e))
    })
  }

  warning("All attempts to retrieve statistics for schema '", schema_name, "' failed.")
  return(NULL)
}
# -----------------------------------------------------------------------------------
# Function to convert minutes to HH:MM:SS format
# -----------------------------------------------------------------------------------
# total_minutes: scalar numeric number of minutes (may be NA/NULL).
# Returns "HH:MM:SS"; NULL, NA and 0 all map to "00:00:00".
format_time_hhmmss <- function(total_minutes) {
  # BUGFIX: guard NULL / zero-length input — `is.na(NULL) || ...` errors in
  # modern R because `||` rejects zero-length operands.
  if (is.null(total_minutes) || length(total_minutes) == 0 ||
      is.na(total_minutes) || total_minutes == 0) {
    return("00:00:00")
  }
  total_seconds <- round(total_minutes * 60)
  hours   <- floor(total_seconds / 3600)
  minutes <- floor((total_seconds %% 3600) / 60)
  seconds <- total_seconds %% 60
  sprintf("%02d:%02d:%02d", hours, minutes, seconds)
}
# -----------------------------------------------------------------------------------
# Function to estimate export time with 16 parallel processes
# -----------------------------------------------------------------------------------
# Estimates wall-clock Data Pump export time for one schema assuming 16
# parallel workers at 75% efficiency.
#
# Args:
#   schema_stats: data.frame with columns size_mb and object_count (one row
#                 per object type), as returned by get_schema_stats(); may be
#                 NULL or empty.
#   model_type:   prediction model; only "basic_16par" is implemented.
#
# Returns a list: estimated_minutes, formatted_time, size_mb, object_count,
# model_used, parallel_processes, parallel_efficiency, base_time_minutes,
# complexity, estimated_dump_size_gb (plus `warning` for the no-data case).
estimate_export_time_16par <- function(schema_stats, model_type = "basic_16par") {
  # No statistics at all -> return a zeroed "FAILED" placeholder so callers
  # can still aggregate over the result uniformly.
  if (is.null(schema_stats) || nrow(schema_stats) == 0) {
    return(list(
      estimated_minutes = 0,
      formatted_time = "00:00:00",
      size_mb = 0,
      object_count = 0,
      model_used = "No data available",
      warning = "No schema statistics were returned",
      parallel_processes = 16,
      parallel_efficiency = 0.75,
      base_time_minutes = 0,
      complexity = "FAILED",
      estimated_dump_size_gb = 0
    ))
  }

  if (model_type == "basic_16par") {
    # Enhanced model for 16 parallel processes
    total_size_mb <- sum(schema_stats$size_mb, na.rm = TRUE)
    total_objects <- sum(schema_stats$object_count, na.rm = TRUE)

    # Base factors (tuned for parallel processing)
    size_factor <- 0.5          # minutes per GB (reduced due to parallelism)
    object_factor <- 0.008      # minutes per 100 objects (reduced)
    parallel_efficiency <- 0.75 # 75% efficiency assumed for 16 processes

    # Sequential baseline: size-driven time plus per-object overhead
    size_time <- (total_size_mb / 1024) * size_factor
    object_time <- (total_objects / 100) * object_factor
    base_time <- size_time + object_time

    # Divide across 16 workers, discounted by the efficiency factor
    parallel_time <- base_time / (16 * parallel_efficiency)

    # Floor of 30 seconds covers job startup overhead on tiny schemas
    parallel_time <- max(parallel_time, 0.5)

    # Complexity classification for parallel processing
    if (parallel_time < 1) {
      complexity <- "VERY SIMPLE"
    } else if (parallel_time < 5) {
      complexity <- "SIMPLE"
    } else if (parallel_time < 15) {
      complexity <- "MODERATE"
    } else if (parallel_time < 30) {
      complexity <- "COMPLEX"
    } else {
      complexity <- "VERY COMPLEX"
    }

    return(list(
      estimated_minutes = round(parallel_time, 2),
      formatted_time = format_time_hhmmss(parallel_time),
      size_mb = round(total_size_mb, 1),
      object_count = total_objects,
      model_used = "16 Parallel Processes Model",
      parallel_processes = 16,
      parallel_efficiency = parallel_efficiency,
      base_time_minutes = round(base_time, 2),
      complexity = complexity,
      estimated_dump_size_gb = round(total_size_mb / 1024 * 0.7, 3) # 70% compression
    ))
  } else if (model_type == "regression") {
    stop("Regression model not implemented in this example")
  } else {
    # BUG FIX: the original fell off the end and returned NULL invisibly for
    # unknown model types; fail loudly instead so callers notice immediately.
    stop("Unknown model_type: ", model_type)
  }
}
# -----------------------------------------------------------------------------------
# Function to generate data pump command for individual schema with 16 parallel
# -----------------------------------------------------------------------------------
# Builds an expdp command template for one schema, sizing dump files and the
# PARALLEL clause from the estimation produced by estimate_export_time_16par().
#
# Args:
#   schema_name: schema to export.
#   estimation:  list with at least size_mb and estimated_dump_size_gb.
#   db_name:     kept for interface compatibility; not used in the command.
#
# Returns a list: command (expdp text), filesize_gb, num_files, actual_parallel.
generate_schema_datapump_16par <- function(schema_name, estimation, db_name = "ORCL") {
  timestamp <- format(Sys.time(), "%Y%m%d_%H%M")

  # Determine optimal file size for schema
  if (estimation$size_mb > 102400) {       # > 100GB
    filesize_gb <- 5
  } else if (estimation$size_mb > 10240) { # > 10GB
    filesize_gb <- 2
  } else if (estimation$size_mb > 1024) {  # > 1GB
    filesize_gb <- 1
  } else {
    filesize_gb <- 0.5                     # 512MB
  }

  # BUG FIX: expdp FILESIZE only accepts whole numbers with a unit suffix, so
  # "FILESIZE=0.5G" is rejected by Data Pump. Express sub-GB sizes in MB.
  filesize_clause <- if (filesize_gb < 1) {
    paste0(round(filesize_gb * 1024), "M")
  } else {
    paste0(filesize_gb, "G")
  }

  # Calculate number of files needed (minimum 2 for parallelism, cap at 8)
  num_files_needed <- ceiling(estimation$estimated_dump_size_gb / filesize_gb)
  num_files_needed <- max(2, min(num_files_needed, 8))

  command <- paste0(
    "expdp system/<password> \\\n",
    " SCHEMAS=", schema_name, " \\\n",
    " DIRECTORY=DATA_PUMP_DIR \\\n",
    " DUMPFILE=", schema_name, "_", timestamp, "_%U.dmp \\\n",
    " FILESIZE=", filesize_clause, " \\\n",
    # Parallelism above the file count would leave workers idle
    " PARALLEL=", min(16, num_files_needed), " \\\n",
    " LOGFILE=", schema_name, "_export_", timestamp, ".log \\\n",
    " COMPRESSION=ALL \\\n",
    " EXCLUDE=STATISTICS \\\n",
    " JOB_NAME=", schema_name, "_EXP_", timestamp
  )

  return(list(
    command = command,
    filesize_gb = filesize_gb,
    num_files = num_files_needed,
    actual_parallel = min(16, num_files_needed)
  ))
}
# -----------------------------------------------------------------------------------
# Function to generate PDF report for ALL schemas with 16 parallel focus
# -----------------------------------------------------------------------------------
# Writes an Rmd document summarizing every successfully analyzed schema and
# renders it to PDF (falling back to HTML when no LaTeX toolchain exists).
#
# Args:
#   all_results: named list of per-schema results from the prediction loop;
#                failed entries are identified by stats == NULL.
#   output_dir:  directory the report is written to (created if missing).
#
# Returns the path of the rendered report, or NULL when nothing was valid.
# The Rmd chunks reference summary_data/complexity_dist/valid_results etc.;
# rmarkdown::render() evaluates in the calling frame by default, so these
# function-local variables are visible to the chunks.
generate_comprehensive_report_16par <- function(all_results, output_dir = "F:/DBA/Scripts/R/Reports") {
  # Create output directory if it doesn't exist
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }

  # Generate unique filename with timestamp
  timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
  report_file <- file.path(output_dir, paste0("All_Schemas_Export_16PAR_Estimation_", timestamp, ".pdf"))

  # Create temporary Rmd file; BUG FIX: clean it up on *every* exit path,
  # including when both render attempts below fail.
  temp_rmd <- tempfile(fileext = ".Rmd")
  on.exit(unlink(temp_rmd), add = TRUE)

  # Filter out failed schemas
  valid_results <- all_results[vapply(all_results, function(x) !is.null(x$stats), logical(1))]
  if (length(valid_results) == 0) {
    warning("No valid schema results to generate report")
    return(NULL)
  }

  # Prepare data for summary table
  summary_data <- data.frame(
    Schema = names(valid_results),
    Size_MB = sapply(valid_results, function(x) x$estimation$size_mb),
    Objects = sapply(valid_results, function(x) x$estimation$object_count),
    Estimated_Minutes = sapply(valid_results, function(x) x$estimation$estimated_minutes),
    Formatted_Time = sapply(valid_results, function(x) x$estimation$formatted_time),
    Complexity = sapply(valid_results, function(x) x$estimation$complexity),
    Base_Time_Minutes = sapply(valid_results, function(x) x$estimation$base_time_minutes),
    Est_Dump_Size_GB = sapply(valid_results, function(x) x$estimation$estimated_dump_size_gb),
    stringsAsFactors = FALSE
  )

  # Order by estimated time (descending)
  summary_data <- summary_data[order(-summary_data$Estimated_Minutes), ]

  # Calculate totals
  total_size <- sum(summary_data$Size_MB, na.rm = TRUE)
  total_objects <- sum(summary_data$Objects, na.rm = TRUE)
  total_time_parallel <- sum(summary_data$Estimated_Minutes, na.rm = TRUE)
  total_time_base <- sum(summary_data$Base_Time_Minutes, na.rm = TRUE)

  # Calculate time savings (guard against division by zero)
  time_savings_percent <- if (total_time_base > 0) {
    round((1 - total_time_parallel / total_time_base) * 100, 1)
  } else {
    0
  }

  # Calculate complexity distribution
  complexity_dist <- table(summary_data$Complexity)

  # Write Rmd content
  writeLines(c(
    "---",
    "title: 'Oracle ALL Schemas Export Time Estimation Report (16 Parallel Processes)'",
    "author: 'Automated DBA Report'",
    "date: '`r Sys.Date()`'",
    "output: pdf_document",
    "---",
    "",
    "```{r setup, include=FALSE}",
    "knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)",
    "library(ggplot2)",
    "library(dplyr)",
    "library(scales)",
    "```",
    "",
    "# Oracle ALL Schemas Export Time Estimation - 16 Parallel Processes",
    "",
    "## Report Summary",
    paste0("- **Total Schemas Analyzed:** ", nrow(summary_data)),
    paste0("- **Total Size:** ", round(total_size, 1), " MB (", round(total_size/1024, 1), " GB)"),
    paste0("- **Total Objects:** ", format(total_objects, big.mark = ",")),
    paste0("- **Total Estimated Export Time (16 PAR):** ", format_time_hhmmss(total_time_parallel)),
    paste0("- **Base Time (Sequential):** ", format_time_hhmmss(total_time_base)),
    paste0("- **Time Savings:** ", time_savings_percent, "% faster with 16 parallel processes"),
    paste0("- **Parallel Efficiency Factor:** 75%"),
    paste0("- **Report Generated:** ", Sys.time()),
    "",
    "## Performance Overview",
    "",
    "### Complexity Distribution",
    "```{r complexity-dist}",
    "complexity_df <- as.data.frame(complexity_dist)",
    "names(complexity_df) <- c('Complexity', 'Count')",
    "knitr::kable(complexity_df, caption = 'Schema Complexity Distribution')",
    "```",
    "",
    "### Time Savings Analysis",
    "```{r time-savings, fig.height=6, fig.width=10}",
    "savings_data <- data.frame(",
    " Method = c('Sequential', '16 Parallel'),",
    " Time_Hours = c(total_time_base/60, total_time_parallel/60)",
    ")",
    "",
    "ggplot(savings_data, aes(x = Method, y = Time_Hours, fill = Method)) +",
    " geom_bar(stat = 'identity') +",
    " labs(title = 'Total Export Time Comparison',",
    " x = 'Export Method',",
    " y = 'Time (Hours)') +",
    " theme_minimal() +",
    " geom_text(aes(label = paste0(round(Time_Hours, 1), ' hours')), vjust = -0.5) +",
    " scale_fill_manual(values = c('Sequential' = 'red', '16 Parallel' = 'green'))",
    "```",
    "",
    "## Export Time Summary for All Schemas (16 Parallel)",
    "",
    "```{r summary-table}",
    "display_data <- summary_data %>%",
    " select(Schema, Size_MB, Objects, Estimated_Minutes, Formatted_Time, Complexity)",
    "knitr::kable(display_data, caption = 'Export Time Estimation with 16 Parallel Processes')",
    "```",
    "",
    "## Visualizations",
    "",
    "### Top 10 Schemas by Estimated Export Time (16 PAR)",
    "```{r top10-plot, fig.height=6, fig.width=10}",
    "top_10 <- summary_data %>%",
    " arrange(desc(Estimated_Minutes)) %>%",
    " head(10)",
    "",
    "ggplot(top_10, aes(x = reorder(Schema, Estimated_Minutes), y = Estimated_Minutes, fill = Complexity)) +",
    " geom_bar(stat = 'identity') +",
    " coord_flip() +",
    " labs(title = 'Top 10 Schemas by Estimated Export Time (16 Parallel)',",
    " x = 'Schema',",
    " y = 'Estimated Time (minutes)') +",
    " theme_minimal() +",
    " theme(legend.position = 'bottom') +",
    " scale_y_continuous(labels = scales::comma)",
    "```",
    "",
    "### Size vs Objects Scatter Plot with Complexity",
    "```{r scatter-plot, fig.height=6, fig.width=10}",
    "ggplot(summary_data, aes(x = Size_MB, y = Objects, size = Estimated_Minutes, color = Complexity)) +",
    " geom_point(alpha = 0.7) +",
    " geom_text(aes(label = ifelse(Estimated_Minutes > 10, Schema, '')), ",
    " size = 3, hjust = 0.5, vjust = -0.5, check_overlap = TRUE) +",
    " labs(title = 'Schema Size vs Number of Objects (16 Parallel)',",
    " x = 'Size (MB)',",
    " y = 'Number of Objects',",
    " size = 'Est. Time (min)',",
    " color = 'Complexity') +",
    " theme_minimal() +",
    " scale_x_log10(labels = scales::comma) +",
    " scale_y_log10(labels = scales::comma)",
    "```",
    "",
    "### Distribution of Estimated Times (16 PAR)",
    "```{r distribution-plot, fig.height=6, fig.width=10}",
    "ggplot(summary_data, aes(x = Estimated_Minutes, fill = Complexity)) +",
    " geom_histogram(binwidth = 1, alpha = 0.7) +",
    " labs(title = 'Distribution of Estimated Export Times (16 Parallel)',",
    " x = 'Estimated Time (minutes)',",
    " y = 'Number of Schemas') +",
    " theme_minimal() +",
    " theme(legend.position = 'bottom') +",
    " scale_x_continuous(labels = scales::comma)",
    "```",
    "",
    "## Data Pump Command Templates for Top Schemas",
    "",
    "### Recommended Parallel Export Strategy",
    "For optimal performance with 16 parallel processes:",
    "",
    "1. **Group small schemas** (Estimated time < 1 minute) into batches",
    "2. **Export medium schemas** (1-5 minutes) individually with reduced parallelism",
    "3. **Large schemas** (>5 minutes) benefit most from full 16 parallel processes",
    "",
    "```{r datapump-examples, results='asis'}",
    "cat('### Example Data Pump Commands for Different Schema Sizes\\n\\n')",
    "",
    "# Get top 3 largest schemas",
    "top_3 <- head(summary_data, 3)",
    "",
    "for(i in 1:min(3, nrow(summary_data))) {",
    " schema <- top_3$Schema[i]",
    " est <- valid_results[[schema]]$estimation",
    " dp_info <- generate_schema_datapump_16par(schema, est)",
    " ",
    " cat('#### ', schema, ' (', est$complexity, ')\\n')",
    " cat('- **Size:** ', est$size_mb, ' MB\\n')",
    " cat('- **Estimated Time:** ', est$formatted_time, '\\n')",
    " cat('- **Recommended Parallelism:** ', dp_info$actual_parallel, '\\n')",
    " cat('- **File Size:** ', dp_info$filesize_gb, ' GB\\n')",
    " cat('- **Number of Files:** ', dp_info$num_files, '\\n')",
    " cat('```bash\\n', dp_info$command, '\\n```\\n\\n')",
    "}",
    "```",
    "",
    "## Batch Export Recommendations",
    "",
    "Based on the analysis, schemas can be grouped for optimal parallel export:",
    "",
    "```{r batch-recommendations}",
    "# Create batch groups",
    "batch_groups <- summary_data %>%",
    " mutate(Batch_Group = case_when(",
    " Estimated_Minutes < 1 ~ 'Batch 1 (Very Fast)',",
    " Estimated_Minutes < 5 ~ 'Batch 2 (Fast)',",
    " Estimated_Minutes < 15 ~ 'Batch 3 (Medium)',",
    " Estimated_Minutes < 30 ~ 'Batch 4 (Large)',",
    " TRUE ~ 'Batch 5 (Very Large)'",
    " )) %>%",
    " group_by(Batch_Group) %>%",
    " summarise(",
    " Count = n(),",
    " Total_Size_MB = sum(Size_MB),",
    " Total_Time_Min = sum(Estimated_Minutes),",
    " Avg_Time_Min = mean(Estimated_Minutes)",
    " ) %>%",
    " arrange(Total_Time_Min)",
    "",
    "knitr::kable(batch_groups, caption = 'Recommended Batch Groups for Parallel Export')",
    "```",
    "",
    "## Performance Considerations for 16 Parallel Exports",
    "",
    "### Advantages:",
    "1. **Maximum throughput** for large schemas",
    "2. **Reduced wall-clock time** for batch exports",
    "3. **Better resource utilization** of modern multi-core servers",
    "",
    "### Requirements:",
    "1. **Sufficient I/O bandwidth** (minimum 200 MB/s per parallel process)",
    "2. **Adequate PGA memory** for parallel query slaves",
    "3. **Fast storage subsystem** (SSD recommended)",
    "4. **Properly configured TEMP tablespace**",
    "",
    "### Monitoring During Export:",
    "```sql",
    "-- Monitor parallel processes",
    "SELECT * FROM v$pq_sesstat;",
    "",
    "-- Monitor active Data Pump jobs",
    "SELECT * FROM dba_datapump_jobs;",
    "",
    "-- Check I/O statistics",
    "SELECT * FROM v$iostat_file;",
    "```",
    "",
    "## Risk Assessment for Parallel Exports",
    "",
    "### High Risk Schemas:",
    "- Schemas with **LOB columns** may not benefit fully from parallelism",
    "- Schemas with **many constraints** may experience serialization",
    "- **Very small schemas** (< 100MB) may have overhead > benefit",
    "",
    "### Recommended Actions:",
    "1. **Pre-export testing** for complex schemas",
    "2. **Monitor PGA usage** during first few exports",
    "3. **Adjust parallelism** based on actual performance",
    "4. **Schedule exports** during off-peak hours",
    "",
    "## Storage Requirements for Parallel Exports",
    "",
    "| Requirement | Calculation | Notes |",
    "|-------------|-------------|-------|",
    paste0("| Dump Files | ", round(sum(summary_data$Est_Dump_Size_GB), 1), " GB | Total compressed size |"),
    paste0("| Temp Space | ", round(total_size/1024 * 0.1, 1), " GB | 10% of total size for sorting |"),
    "| Log Files | 1-3 GB | Per parallel export job |",
    paste0("| **Total Required** | **", round(sum(summary_data$Est_Dump_Size_GB) + total_size/1024 * 0.1 + 3, 1), " GB** | Minimum storage requirement |"),
    "",
    "## Schedule Optimization",
    "",
    paste0("Based on total estimated time of ", format_time_hhmmss(total_time_parallel), ":"),
    "",
    "- **Recommended batch size:** 4-6 schemas concurrently",
    "- **Optimal concurrency level:** 75% of available CPU cores",
    "- **Memory requirement:** 2GB PGA per parallel process",
    "- **Monitoring frequency:** Every 15 minutes during exports",
    "",
    "## Notes",
    "- This report provides estimates for **16 parallel process exports**",
    "- System schemas are excluded from analysis",
    "- Estimated times include **75% parallel efficiency factor**",
    "- Actual export times may vary based on:",
    " - I/O subsystem performance",
    " - Database version and configuration",
    " - System load during export",
    " - Network bandwidth to target",
    "- **Parallel exports require careful monitoring** to avoid resource contention",
    "- **Test exports recommended** for production-critical schemas",
    "- Report generated by automated DBA script (16 Parallel Edition)"
  ), temp_rmd)

  # Render the report. BUG FIX: the original assigned report_file inside the
  # error *handler*, which only bound a handler-local variable, so after the
  # HTML fallback the function returned a PDF path that was never created.
  # Assigning the tryCatch result makes the fallback path propagate correctly.
  report_file <- tryCatch({
    rmarkdown::render(temp_rmd, output_file = report_file, quiet = TRUE)
    message("Comprehensive report saved to: ", report_file)
    report_file
  }, error = function(e) {
    warning("Failed to render PDF report: ", conditionMessage(e))
    # Try HTML as fallback (e.g. no LaTeX toolchain available)
    report_file_html <- file.path(output_dir, paste0("All_Schemas_Export_16PAR_Estimation_", timestamp, ".html"))
    writeLines(gsub("output: pdf_document", "output: html_document", readLines(temp_rmd)), temp_rmd)
    rmarkdown::render(temp_rmd, output_file = report_file_html, quiet = TRUE)
    message("HTML report saved to: ", report_file_html)
    report_file_html
  })

  return(report_file)
}
# -----------------------------------------------------------------------------------
# Main function to predict export time for ALL schemas with 16 parallel
# -----------------------------------------------------------------------------------
# Drives the whole analysis: enumerates schemas, collects statistics, estimates
# export times, builds Data Pump templates, and generates the PDF report.
#
# Args:
#   conn:       an open DBI connection to the Oracle database.
#   model_type: prediction model passed through to estimate_export_time_16par().
#
# Returns a list: all_results (per-schema), report_path, total_schemas,
# processed_schemas, and summary_stats (aggregate timings and savings).
predict_all_schemas_export_time_16par <- function(conn, model_type = "basic_16par") {
  # Get all schemas
  message("Retrieving all schemas from database...")
  all_schemas <- get_all_schemas(conn)
  message("Found ", length(all_schemas), " schemas to analyze")

  # Initialize results list
  results <- list()

  # Process each schema
  progress <- 0
  for (schema in all_schemas) {
    progress <- progress + 1
    message(sprintf("[%03d/%03d] Processing schema: %s", progress, length(all_schemas), schema))

    # BUG FIX: the original assigned results[[schema]] inside the error
    # handler with `<-`, which only created a handler-local copy -- failed
    # schemas were silently dropped from the results despite the intent of
    # the "Store empty result" comment. Assigning the tryCatch value fixes it.
    results[[schema]] <- tryCatch({
      # Get schema statistics
      schema_stats <- get_schema_stats(conn, schema)

      # Estimate export time with 16 parallel processes
      estimation <- estimate_export_time_16par(schema_stats, model_type)

      # Generate data pump command for this schema
      datapump_info <- generate_schema_datapump_16par(schema, estimation)

      message(sprintf(" - Size: %7.1f MB, Objects: %6d, Time: %s (%s)",
                      estimation$size_mb, estimation$object_count,
                      estimation$formatted_time, estimation$complexity))

      list(
        schema_name = schema,
        estimation = estimation,
        datapump_info = datapump_info,
        stats = schema_stats
      )
    }, error = function(e) {
      warning("Failed to process schema ", schema, ": ", conditionMessage(e))
      # Placeholder result for failed schemas (stats = NULL marks failure)
      list(
        schema_name = schema,
        estimation = list(
          estimated_minutes = 0,
          formatted_time = "00:00:00",
          size_mb = 0,
          object_count = 0,
          model_used = "Failed to analyze",
          parallel_processes = 16,
          parallel_efficiency = 0.75,
          base_time_minutes = 0,
          complexity = "FAILED",
          estimated_dump_size_gb = 0,
          warning = conditionMessage(e)
        ),
        datapump_info = NULL,
        stats = NULL
      )
    })

    # Small delay to avoid overwhelming the database
    Sys.sleep(0.05)
  }

  # Generate comprehensive PDF report
  message("\nGenerating comprehensive report for 16 parallel processes...")
  report_path <- generate_comprehensive_report_16par(results)

  # Calculate summary statistics (vapply fixes the return type, unlike sapply)
  successful_schemas <- sum(vapply(results, function(x) !is.null(x$stats), logical(1)))
  total_parallel_time <- sum(vapply(results, function(x) x$estimation$estimated_minutes, numeric(1)))
  total_base_time <- sum(vapply(results, function(x) x$estimation$base_time_minutes, numeric(1)))

  # Return results
  return(list(
    all_results = results,
    report_path = report_path,
    total_schemas = length(all_schemas),
    processed_schemas = successful_schemas,
    summary_stats = list(
      total_parallel_time_minutes = total_parallel_time,
      total_base_time_minutes = total_base_time,
      time_savings_percent = if (total_base_time > 0) {
        round((1 - total_parallel_time / total_base_time) * 100, 1)
      } else {
        0
      },
      avg_parallel_time_per_schema = if (successful_schemas > 0) {
        round(total_parallel_time / successful_schemas, 2)
      } else {
        0
      }
    )
  ))
}
# -----------------------------------------------------------------------------------
# Execute the analysis for ALL schemas with 16 parallel processes
# -----------------------------------------------------------------------------------
# Announce the run on the console before kicking off the analysis below.
cat("\n", strrep("=", 80), "\n", sep = "")
cat("STARTING EXPORT TIME ESTIMATION FOR ALL DATABASE SCHEMAS\n",
    "OPTIMIZED FOR 16 PARALLEL PROCESSES\n", sep = "")
cat(strrep("=", 80), "\n", sep = "")
# Run the full prediction pipeline; on any error report it and fall back to
# NULL so the summary section below is skipped gracefully.
all_results <- tryCatch(
  predict_all_schemas_export_time_16par(conn, model_type = "basic_16par"),
  error = function(e) {
    cat("Analysis failed: ", conditionMessage(e), "\n")
    NULL
  }
)
# Close connection when done -- release the ODBC handle opened at the top of
# the script; runs regardless of whether the analysis above succeeded.
dbDisconnect(conn)
# -----------------------------------------------------------------------------------
# Output results and summary
# -----------------------------------------------------------------------------------
# Prints a console summary of the analysis (overall totals, the top-10 slowest
# schemas, batch-export recommendations) and persists the full result list to
# an .rds file. Runs only when the analysis above succeeded.
if (!is.null(all_results)) {
  # Print summary
  cat(paste0("\n", strrep("=", 80), "\n"))
  cat("ANALYSIS COMPLETE - 16 PARALLEL PROCESSES\n")
  cat(paste0(strrep("=", 80), "\n"))
  cat("Total schemas analyzed:", all_results$total_schemas, "\n")
  cat("Successfully processed:", all_results$processed_schemas, "\n")
  cat("Total estimated time (16 PAR):", format_time_hhmmss(all_results$summary_stats$total_parallel_time_minutes), "\n")
  cat("Base time (sequential):", format_time_hhmmss(all_results$summary_stats$total_base_time_minutes), "\n")
  cat("Time savings:", all_results$summary_stats$time_savings_percent, "% faster\n")
  cat("Average time per schema:", round(all_results$summary_stats$avg_parallel_time_per_schema, 2), "minutes\n")
  cat("Comprehensive report saved to:", all_results$report_path, "\n\n")

  # Print top 10 schemas by estimated time
  if (length(all_results$all_results) > 0) {
    # Filter out failed schemas for display (failed entries carry stats = NULL)
    valid_display <- all_results$all_results[sapply(all_results$all_results, function(x) !is.null(x$stats))]

    if (length(valid_display) > 0) {
      # Extract summary data: one named vector per schema. NOTE: c() coerces
      # the mixed values to character; numeric columns are restored below.
      summary_list <- lapply(names(valid_display), function(schema) {
        est <- valid_display[[schema]]$estimation
        c(
          Schema = schema,
          Size_MB = est$size_mb,
          Objects = est$object_count,
          Minutes = est$estimated_minutes,
          Time = est$formatted_time,
          Complexity = est$complexity,
          # Percent saved vs the sequential baseline; 0 when undefined
          Parallel_Saving = if (est$base_time_minutes > 0 && est$estimated_minutes > 0) {
            round((1 - est$estimated_minutes / est$base_time_minutes) * 100, 1)
          } else {
            0
          }
        )
      })

      summary_df <- as.data.frame(do.call(rbind, summary_list))
      # Restore numeric types lost in the character coercion above
      summary_df$Size_MB <- as.numeric(summary_df$Size_MB)
      summary_df$Objects <- as.numeric(summary_df$Objects)
      summary_df$Minutes <- as.numeric(summary_df$Minutes)
      summary_df$Parallel_Saving <- as.numeric(summary_df$Parallel_Saving)

      # Order by estimated time
      summary_df <- summary_df[order(-summary_df$Minutes), ]

      cat("TOP 10 SCHEMAS BY ESTIMATED EXPORT TIME (16 PARALLEL):\n")
      cat(paste0(strrep("-", 100), "\n"))
      cat(sprintf("%-25s %10s %10s %10s %12s %15s %12s\n", "SCHEMA", "SIZE_MB", "OBJECTS", "MINUTES", "TIME", "COMPLEXITY", "PAR_SAVING"))
      cat(paste0(strrep("-", 100), "\n"))

      # NOTE(review): %10d assumes Objects holds whole numbers (true here,
      # since counts survive the round-trip through character); sprintf %d
      # would error on a fractional double. The 1:min(...) loop is safe
      # because length(valid_display) > 0 guarantees nrow >= 1.
      for (i in 1:min(10, nrow(summary_df))) {
        cat(sprintf("%-25s %10.1f %10d %10.2f %12s %15s %11.1f%%\n",
                    summary_df$Schema[i], summary_df$Size_MB[i], summary_df$Objects[i],
                    summary_df$Minutes[i], summary_df$Time[i], summary_df$Complexity[i],
                    summary_df$Parallel_Saving[i]))
      }

      # Print batch recommendations
      cat(paste0("\n", strrep("=", 80), "\n"))
      cat("BATCH EXPORT RECOMMENDATIONS FOR 16 PARALLEL PROCESSES\n")
      cat(paste0(strrep("=", 80), "\n"))

      # Group schemas by complexity/time (buckets by estimated minutes)
      fast_schemas <- summary_df[summary_df$Minutes < 1, ]
      medium_schemas <- summary_df[summary_df$Minutes >= 1 & summary_df$Minutes < 5, ]
      large_schemas <- summary_df[summary_df$Minutes >= 5 & summary_df$Minutes < 15, ]
      xlarge_schemas <- summary_df[summary_df$Minutes >= 15, ]

      cat(sprintf("Fast schemas (<1 min): %d schemas, total time: %.1f min\n", nrow(fast_schemas), sum(fast_schemas$Minutes, na.rm = TRUE)))
      cat(" Recommended: Export in batches of 8-10 schemas concurrently\n\n")
      cat(sprintf("Medium schemas (1-5 min): %d schemas, total time: %.1f min\n", nrow(medium_schemas), sum(medium_schemas$Minutes, na.rm = TRUE)))
      cat(" Recommended: Export in batches of 4-6 schemas concurrently\n\n")
      cat(sprintf("Large schemas (5-15 min): %d schemas, total time: %.1f min\n", nrow(large_schemas), sum(large_schemas$Minutes, na.rm = TRUE)))
      cat(" Recommended: Export individually or 2-3 concurrently\n\n")
      cat(sprintf("Extra Large schemas (>15 min): %d schemas, total time: %.1f min\n", nrow(xlarge_schemas), sum(xlarge_schemas$Minutes, na.rm = TRUE)))
      cat(" Recommended: Export individually with full 16 parallelism\n\n")

      # Optimistic wall-clock estimate assuming the recommended concurrency
      # per group (10-way, 5-way, 2-way, serial respectively)
      cat("Total estimated batch time (optimized):", format_time_hhmmss(sum(fast_schemas$Minutes)/10 + sum(medium_schemas$Minutes)/5 + sum(large_schemas$Minutes)/2 + sum(xlarge_schemas$Minutes)), "\n")
    }
  }

  # Save detailed results to RDS (timestamped file in the working directory)
  rds_file <- paste0("all_schemas_16par_analysis_", format(Sys.time(), "%Y%m%d_%H%M"), ".rds")
  saveRDS(all_results, file = rds_file)
  cat("\nDetailed results saved to RDS file:", rds_file, "\n")
}
# -----------------------------------------------------------------------------------
# End of Script
# -----------------------------------------------------------------------------------
cat(paste0("\n", strrep("=", 80), "\n"))
# BUG FIX: cat() does not apply the POSIXct print method -- cat(Sys.time())
# prints the raw numeric epoch seconds. format() yields a readable timestamp.
cat("ANALYSIS COMPLETED AT:", format(Sys.time()), "\n")
cat(paste0(strrep("=", 80), "\n"))
message("All schemas export prediction script execution completed successfully.")
# -----------------------------------------------------------------------------------