fix: stream document+lines export to disk in small batches

The previous $expand approach tried to load all documents with lines
into memory at once, causing hangs/OOM on companies with hundreds of
thousands of records.

Changes:
- Fetch documents with $expand in small pages ($top=50) instead of
  loading everything into memory
- Stream each document to disk immediately as JSONL (one JSON object
  per line) instead of accumulating in an array
- Add automatic token refresh for long-running exports (tokens typically
  expire after 60-90 min; the script refreshes 5 min before expiry)
- Add 300s timeout per API request to detect stalls
- Log progress after each batch so you can see it's working

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-16 09:09:47 +01:00
parent 4af9cd7f11
commit 5ebfc3f443

View File

@@ -36,8 +36,9 @@ $entities = @(
"countriesRegions"
)
# Document entities with line items - fetched with $expand to include lines
# Document entities with line items
# Lines cannot be queried standalone; they require a parent document ID
# We fetch documents in small batches with $expand to include lines
$documentEntities = @{
"salesInvoices" = "salesInvoiceLines"
"salesOrders" = "salesOrderLines"
@@ -46,6 +47,13 @@ $documentEntities = @{
"purchaseOrders" = "purchaseOrderLines"
}
# Page size for $expand queries - kept small to avoid timeouts/OOM on large datasets
$expandPageSize = 50
# Token management
$script:currentToken = $null
$script:tokenExpiry = [datetime]::MinValue
function Write-Log {
param([string]$Message, [string]$Level = "INFO")
$timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
@@ -53,27 +61,24 @@ function Write-Log {
}
function Get-AzureADToken {
param(
[string]$TenantId,
[string]$ClientId,
[string]$ClientSecret
)
Write-Log "Authenticating to Azure AD..."
$tokenUrl = "https://login.microsoftonline.com/$TenantId/oauth2/v2.0/token"
$tokenUrl = "https://login.microsoftonline.com/$tenantId/oauth2/v2.0/token"
$body = @{
client_id = $ClientId
client_secret = $ClientSecret
client_id = $clientId
client_secret = $clientSecret
scope = "https://api.businesscentral.dynamics.com/.default"
grant_type = "client_credentials"
}
try {
$response = Invoke-RestMethod -Uri $tokenUrl -Method Post -Body $body -ContentType "application/x-www-form-urlencoded"
Write-Log "Successfully authenticated to Azure AD"
return $response.access_token
$script:currentToken = $response.access_token
# Refresh 5 minutes before actual expiry (tokens typically last 60-90 min)
$script:tokenExpiry = (Get-Date).AddSeconds($response.expires_in - 300)
Write-Log "Successfully authenticated to Azure AD (expires in $($response.expires_in)s)"
return $script:currentToken
}
catch {
Write-Log "Failed to authenticate: $_" "ERROR"
@@ -81,14 +86,22 @@ function Get-AzureADToken {
}
}
# Returns the cached access token, re-authenticating first when no token is
# cached or the current time has reached $script:tokenExpiry.
function Get-ValidToken {
# A null token means nothing has been stored yet; otherwise compare the clock against the stored expiry
if ($null -eq $script:currentToken -or (Get-Date) -ge $script:tokenExpiry) {
Write-Log "Token expired or missing, refreshing..."
# The return value is discarded here; the token is read back from $script:currentToken below
Get-AzureADToken | Out-Null
}
return $script:currentToken
}
function Get-BCData {
param(
[string]$Token,
[string]$Url
)
$token = Get-ValidToken
$headers = @{
"Authorization" = "Bearer $Token"
"Authorization" = "Bearer $token"
"Accept" = "application/json"
}
@@ -97,7 +110,7 @@ function Get-BCData {
$currentUrl = $Url
while ($currentUrl) {
try {
$response = Invoke-RestMethod -Uri $currentUrl -Method Get -Headers $headers
$response = Invoke-RestMethod -Uri $currentUrl -Method Get -Headers $headers -TimeoutSec 300
}
catch {
Write-Log "API request failed for $currentUrl : $_" "ERROR"
@@ -116,18 +129,15 @@ function Get-BCData {
}
function Get-Companies {
param([string]$Token)
Write-Log "Fetching companies..."
$companiesUrl = "$baseUrl/companies"
$companies = Get-BCData -Token $Token -Url $companiesUrl
$companies = Get-BCData -Url $companiesUrl
Write-Log "Found $($companies.Count) company/companies"
return $companies
}
function Export-EntityData {
param(
[string]$Token,
[string]$CompanyId,
[string]$CompanyName,
[string]$EntityName,
@@ -139,7 +149,7 @@ function Export-EntityData {
Write-Log " Exporting $EntityName..."
try {
$data = Get-BCData -Token $Token -Url $entityUrl
$data = Get-BCData -Url $entityUrl
$count = 0
if ($data) { $count = $data.Count }
@@ -151,7 +161,6 @@ function Export-EntityData {
}
catch {
Write-Log " Failed to export ${EntityName}: $_" "WARN"
# Write empty array so downstream knows it was attempted
$outputFile = Join-Path $OutputDir "$EntityName.json"
"[]" | Out-File -FilePath $outputFile -Encoding utf8
return 0
@@ -160,7 +169,6 @@ function Export-EntityData {
function Export-DocumentWithLines {
param(
[string]$Token,
[string]$CompanyId,
[string]$CompanyName,
[string]$DocumentEntity,
@@ -168,34 +176,62 @@ function Export-DocumentWithLines {
[string]$OutputDir
)
$entityUrl = "$baseUrl/companies($CompanyId)/$DocumentEntity" + '?$expand=' + $LineEntity
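# Use $expand with a small $top to avoid huge payloads, then follow @odata.nextLink for subsequent pages
# NOTE(review): in OData, $top caps the total number of records returned rather than the page size, so
# the service may return only the first $expandPageSize documents with no @odata.nextLink - verify this;
# the "Prefer: odata.maxpagesize=N" request header is the usual way to ask for a smaller page size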
$entityUrl = "$baseUrl/companies($CompanyId)/$DocumentEntity" + '?$expand=' + $LineEntity + '&$top=' + $expandPageSize
Write-Log " Exporting $DocumentEntity (with $LineEntity)..."
Write-Log " Exporting $DocumentEntity (with $LineEntity, batch size $expandPageSize)..."
$outputFile = Join-Path $OutputDir "$DocumentEntity.jsonl"
$docCount = 0
$lineCount = 0
$currentUrl = $entityUrl
try {
$data = Get-BCData -Token $Token -Url $entityUrl
$docCount = 0
$lineCount = 0
if ($data) {
$docCount = $data.Count
foreach ($doc in $data) {
if ($doc.$LineEntity) {
$lineCount += $doc.$LineEntity.Count
# Clear output file
[System.IO.File]::WriteAllText($outputFile, "")
while ($currentUrl) {
$token = Get-ValidToken
$headers = @{
"Authorization" = "Bearer $token"
"Accept" = "application/json"
}
try {
$response = Invoke-RestMethod -Uri $currentUrl -Method Get -Headers $headers -TimeoutSec 300
}
catch {
Write-Log " API request failed at doc #$docCount : $_" "ERROR"
throw
}
if ($response.value) {
foreach ($doc in $response.value) {
$lines = 0
if ($doc.$LineEntity) {
$lines = $doc.$LineEntity.Count
}
$lineCount += $lines
$docCount++
# Write each document immediately to disk (JSONL: one JSON object per line)
$jsonLine = $doc | ConvertTo-Json -Depth 10 -Compress
[System.IO.File]::AppendAllText($outputFile, $jsonLine + "`n")
}
}
Write-Log " Progress: $docCount documents, $lineCount lines"
# Next page
$currentUrl = $response.'@odata.nextLink'
}
# Save the documents (with lines embedded)
$outputFile = Join-Path $OutputDir "$DocumentEntity.json"
$data | ConvertTo-Json -Depth 10 | Out-File -FilePath $outputFile -Encoding utf8
Write-Log " $DocumentEntity : $docCount documents, $lineCount lines"
Write-Log " $DocumentEntity : $docCount documents, $lineCount lines (complete)"
return ($docCount + $lineCount)
}
catch {
Write-Log " Failed to export ${DocumentEntity} with lines: $_" "WARN"
$outputFile = Join-Path $OutputDir "$DocumentEntity.json"
"[]" | Out-File -FilePath $outputFile -Encoding utf8
Write-Log " Failed to export ${DocumentEntity} with lines at doc #$docCount : $_" "WARN"
return 0
}
}
@@ -216,10 +252,10 @@ try {
}
# Step 1: Get Azure AD token
$token = Get-AzureADToken -TenantId $tenantId -ClientId $clientId -ClientSecret $clientSecret
Get-AzureADToken | Out-Null
# Step 2: Get companies
$companies = Get-Companies -Token $token
$companies = Get-Companies
if ($companies.Count -eq 0) {
Write-Log "No companies found in environment $environmentName" "ERROR"
@@ -262,7 +298,6 @@ try {
# Export standalone entities
foreach ($entity in $entities) {
$count = Export-EntityData `
-Token $token `
-CompanyId $companyId `
-CompanyName $companyName `
-EntityName $entity `
@@ -276,12 +311,12 @@ try {
}
}
# Export document entities with their line items via $expand
# Export document entities with their line items
# Uses small batches + streaming to disk to handle large datasets
foreach ($docEntity in $documentEntities.Keys) {
$lineEntity = $documentEntities[$docEntity]
$count = Export-DocumentWithLines `
-Token $token `
-CompanyId $companyId `
-CompanyName $companyName `
-DocumentEntity $docEntity `