diff --git a/backup.ps1 b/backup.ps1
index 6436f24..e09f238 100644
--- a/backup.ps1
+++ b/backup.ps1
@@ -21,11 +21,11 @@ param(
 # Each job will automatically share the database load using DatabasesInParallel=Y if Jobs>1
 
 # TODO: See if there is way to query QueueDatabase during backup to monitor progress
-# TODO: Better trapping when RSC connection fails
+# TODO: Add retry/backoff strategy for transient RSC API throttling failures
 
-$fullBackupDay = 'Thursday'
+$fullBackupDay = 'Saturday'
 $fullBackupOverdueDays = 7
-$SAFile = "C:\Rubrik\scripts\rbksql.xml"
+$SAFile = "C:\Rubrik\scripts\gmsa.xml"
 $logDir = "C:\Rubrik\logs"
 
 function Write-Log($message, $jobId = "") {
@@ -54,6 +54,125 @@ function Write-Log($message, $jobId = "") {
     Write-Host $logEntry
 }
 
+# Flattens an exception and everything nested inside it (InnerException, and
+# AggregateException.InnerExceptions) into "<type full name>: <message>" strings,
+# indented by nesting depth. Duplicate entries are removed before returning.
+function Get-ExceptionMessageList {
+    param([System.Exception]$Exception)
+
+    $messages = New-Object System.Collections.Generic.List[string]
+
+    function Add-ExceptionMessages {
+        param(
+            [System.Exception]$Current,
+            [int]$Depth
+        )
+
+        if (-not $Current) { return }
+
+        $indent = (' ' * $Depth)
+        $text = $Current.Message
+        if ([string]::IsNullOrWhiteSpace($text)) {
+            $text = ""
+        }
+
+        $messages.Add("$indent$($Current.GetType().FullName): $text")
+
+        if ($Current -is [System.AggregateException]) {
+            foreach ($inner in $Current.InnerExceptions) {
+                Add-ExceptionMessages -Current $inner -Depth ($Depth + 1)
+            }
+        }
+
+        if ($Current.InnerException) {
+            Add-ExceptionMessages -Current $Current.InnerException -Depth ($Depth + 1)
+        }
+    }
+
+    Add-ExceptionMessages -Current $Exception -Depth 0
+    return @($messages | Select-Object -Unique)
+}
+
+# Writes an exception and its nested causes to the log through Write-Log. The
+# first entry is logged as "ERROR: <Prefix> <entry>", the rest as
+# "ERROR: detail[i]: <entry>". A null exception is logged as an unknown error.
+function Write-ExceptionDiagnostics {
+    param(
+        [string]$Prefix,
+        [System.Exception]$Exception
+    )
+
+    if (-not $Exception) {
+        Write-Log "ERROR: $Prefix Unknown exception (exception object was null)."
+        return
+    }
+
+    # Wrap in @(): PowerShell unrolls a one-element array returned from a function
+    # into a bare string, and indexing a string with [0] yields only its first char.
+    $detailLines = @(Get-ExceptionMessageList -Exception $Exception)
+    if ($detailLines.Count -eq 0) {
+        Write-Log "ERROR: $Prefix $($Exception.Message)"
+        return
+    }
+
+    Write-Log "ERROR: $Prefix $($detailLines[0])"
+    for ($i = 1; $i -lt $detailLines.Count; $i++) {
+        Write-Log "ERROR: detail[$i]: $($detailLines[$i])"
+    }
+}
+
+# Validates the service account file before it is handed to Connect-Rsc: the
+# path must be non-empty, exist as a file, resolve, and be readable. Throws a
+# descriptive message on the first check that fails.
+function Test-ServiceAccountFile {
+    param([string]$ServiceAccountFile)
+
+    if ([string]::IsNullOrWhiteSpace($ServiceAccountFile)) {
+        throw "Service account file path is empty."
+    }
+
+    if (-not (Test-Path -LiteralPath $ServiceAccountFile -PathType Leaf)) {
+        throw "Service account file not found: $ServiceAccountFile"
+    }
+
+    try {
+        $resolved = Resolve-Path -LiteralPath $ServiceAccountFile -ErrorAction Stop
+        Write-Log "INFO: Using service account file: $resolved"
+    } catch {
+        throw "Unable to resolve service account file path '$ServiceAccountFile'. $($_.Exception.Message)"
+    }
+
+    try {
+        # Ensure we can read the file before passing it to Connect-Rsc.
+        Get-Content -LiteralPath $ServiceAccountFile -Raw -ErrorAction Stop | Out-Null
+    } catch {
+        throw "Cannot read service account file '$ServiceAccountFile'. $($_.Exception.Message)"
+    }
+}
+
+# Connects to Rubrik Security Cloud with Connect-Rsc after validating the service
+# account file; $Context labels the log lines. On failure it logs the exception
+# chain plus troubleshooting hints and rethrows so the caller decides what to do.
+function Connect-RscWithDiagnostics {
+    param(
+        [string]$ServiceAccountFile,
+        [string]$Context = "RSC"
+    )
+
+    Write-Log "INFO: $Context Connecting to Rubrik Security Cloud..."
+
+    try {
+        Test-ServiceAccountFile -ServiceAccountFile $ServiceAccountFile
+        $null = Connect-Rsc -ServiceAccountFile $ServiceAccountFile -ErrorAction Stop
+        Write-Log "INFO: $Context Connected to Rubrik Security Cloud."
+    } catch {
+        $prefix = "($Context Connect-Rsc -ServiceAccountFile $ServiceAccountFile) failed."
+        Write-ExceptionDiagnostics -Prefix $prefix -Exception $_.Exception
+        Write-Log "ERROR: Troubleshooting hints: verify file permissions/content, outbound HTTPS connectivity, proxy configuration, and local system time sync."
+        throw
+    }
+}
+
 # Parse instance name from SQL instance parameter
 $instanceParts = $SqlInstance -split '\\'
 if ($instanceParts.Length -eq 2) {
@@ -180,12 +299,16 @@ $clusterInstance = Get-ClusterResource | Where-Object { $_.ResourceType -eq "SQL
 if ($clusterInstance) {
     $ownerNode = $clusterInstance.OwnerNode
     if ($ownerNode -ne $localNode) {
-        Write-Log "SQL instance '$SqlInstance' is not running on local node '$localNode'. Updating the MV."
+        Write-Log "SQL instance '$SqlInstance' is not running on local node '$localNode'. Updating the MV to point to $ownerNode before backup continues."
 
-        Connect-Rsc -ServiceAccountFile $SAFile
-        Write-Log "Connected to Rubrik Security Cloud."
+        try {
+            Connect-RscWithDiagnostics -ServiceAccountFile $SAFile -Context "Cluster owner update"
+        } catch {
+            exit 1
+        }
 
         $newHost = Get-RscHost -Name $ownerNode -OsType WINDOWS
+        Write-Log "Found owner node in Rubrik: $($newHost.Name) (ID: $($newHost.Id))"
 
         $query = New-RscQuery -GqlQuery slaManagedVolumes -AddField Nodes.HostDetail, Nodes.SmbShare, Nodes.ClientConfig, Nodes.ClientConfig.BackupScript, Nodes.ClientConfig.PreBackupScript
         $query.var.filter = @(Get-RscType -Name Filter)
@@ -250,8 +373,7 @@ if ($clusterInstance) {
 
 # Connect to Rubrik and retrieve managed volume paths
 try {
-    Connect-Rsc -ServiceAccountFile $SAFile
-    Write-Log "INFO: Connected to Rubrik Security Cloud."
+    Connect-RscWithDiagnostics -ServiceAccountFile $SAFile -Context "Managed volume lookup"
 
     $query = New-RscQuery -GqlQuery slaManagedVolumes -AddField Nodes.HostDetail, Nodes.SmbShare, Nodes.ClientConfig, Nodes.ClientConfig.BackupScript, Nodes.ClientConfig.PreBackupScript
     $query.var.filter = @(Get-RscType -Name Filter)
@@ -265,9 +387,13 @@ try {
     }
 
     $paths = $mvDetail.nodes[0].ClientConfig.ChannelHostMountPaths
+    if (-not $paths -or $paths.Count -eq 0) {
+        Write-Log "ERROR: Managed Volume '$MvName' returned no ChannelHostMountPaths. Check MV host mapping and client config permissions."
+        exit 1
+    }
     Write-Log "INFO: Retrieved paths: $($paths -join ', ')"
 } catch {
-    Write-Log "ERROR: Failed to retrieve paths from Rubrik. $($_.Exception.Message)"
+    Write-ExceptionDiagnostics -Prefix "Failed to retrieve paths from Rubrik." -Exception $_.Exception
     exit 1
 }
 