Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ees 5686 add public api performance metric alerts #5463

Open
wants to merge 10 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,11 @@ module apiContainerAppModule '../../components/containerApp.bicep' = {
]
requireAuthentication: false
}
tagValues: tagValues
}
}

module containerAppRestartsAlert '../../components/alerts/containerApps/restarts.bicep' = if (deployAlerts) {
name: '${resourceNames.publicApi.apiApp}RestartsDeploy'
params: {
resourceNames: [resourceNames.publicApi.apiApp]
alertsGroupName: resourceNames.existingResources.alertsGroup
alerts: deployAlerts ? {
restarts: true
responseTime: true
alertsGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,41 +112,14 @@ module dataProcessorFunctionAppModule '../../components/functionApp.bicep' = {
mountPath: publicApiDataFileShareMountPath
}]
storageFirewallRules: storageFirewallRules
tagValues: tagValues
}
}

module functionAppHealthAlert '../../components/alerts/sites/healthAlert.bicep' = if (deployAlerts) {
name: '${resourceNames.publicApi.dataProcessor}HealthDeploy'
params: {
resourceNames: [resourceNames.publicApi.dataProcessor]
alertsGroupName: resourceNames.existingResources.alertsGroup
tagValues: tagValues
}
}

module storageAccountAvailabilityAlerts '../../components/alerts/storageAccounts/availabilityAlert.bicep' = if (deployAlerts) {
name: '${resourceNames.publicApi.dataProcessor}StorageAvailabilityDeploy'
params: {
resourceNames: [
dataProcessorFunctionAppModule.outputs.managementStorageAccountName
dataProcessorFunctionAppModule.outputs.slot1StorageAccountName
dataProcessorFunctionAppModule.outputs.slot2StorageAccountName
]
alertsGroupName: resourceNames.existingResources.alertsGroup
tagValues: tagValues
}
}

module fileServiceAvailabilityAlerts '../../components/alerts/fileServices/availabilityAlert.bicep' = if (deployAlerts) {
name: '${resourceNames.publicApi.dataProcessor}FsAvailabilityDeploy'
params: {
resourceNames: [
dataProcessorFunctionAppModule.outputs.managementStorageAccountName
dataProcessorFunctionAppModule.outputs.slot1StorageAccountName
dataProcessorFunctionAppModule.outputs.slot2StorageAccountName
]
alertsGroupName: resourceNames.existingResources.alertsGroup
alerts: deployAlerts ? {
functionAppHealth: true
storageAccountAvailability: true
storageLatency: true
fileServiceAvailability: true
fileServiceLatency: true
alertsGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ module publicApiStorageAccountModule '../../components/storageAccount.bicep' = {
firewallRules: storageFirewallRules
skuStorageResource: 'Standard_LRS'
keyVaultName: resourceNames.existingResources.keyVault
alerts: deployAlerts ? {
availability: true
latency: true
alertsGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
}
}
Expand All @@ -62,23 +67,11 @@ module dataFilesFileShareModule '../../components/fileShare.bicep' = {
fileShareQuota: publicApiDataFileShareQuota
storageAccountName: publicApiStorageAccountModule.outputs.storageAccountName
fileShareAccessTier: 'TransactionOptimized'
}
}

module storageAccountAvailabilityAlert '../../components/alerts/storageAccounts/availabilityAlert.bicep' = if (deployAlerts) {
name: '${resourceNames.publicApi.publicApiStorageAccount}AvailabilityDeploy'
params: {
resourceNames: [resourceNames.publicApi.publicApiStorageAccount]
alertsGroupName: resourceNames.existingResources.alertsGroup
tagValues: tagValues
}
}

module fileServiceAvailabilityAlert '../../components/alerts/fileServices/availabilityAlert.bicep' = if (deployAlerts) {
name: '${resourceNames.publicApi.publicApiStorageAccount}FsAvailabilityDeploy'
params: {
resourceNames: [resourceNames.publicApi.publicApiStorageAccount]
alertsGroupName: resourceNames.existingResources.alertsGroup
alerts: deployAlerts ? {
availability: true
latency: true
alertsGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,11 @@ module appGatewayModule '../../components/appGateway.bicep' = {
backends: backends
routes: routes
rewrites: rewrites
tagValues: tagValues
}
}

module backendPoolsHealthAlert '../../components/alerts/appGateways/backendPoolHealth.bicep' = if (deployAlerts) {
name: '${resourceNames.sharedResources.appGateway}BackendPoolsHealthDeploy'
params: {
resourceNames: [resourceNames.sharedResources.appGateway]
alertsGroupName: resourceNames.existingResources.alertsGroup
alerts: deployAlerts ? {
health: true
responseTime: true
alertsGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ module postgreSqlServerModule '../../components/postgresqlDatabase.bicep' = {
firewallRules: formattedFirewallRules
databaseNames: ['public_data']
privateEndpointSubnetId: privateEndpointSubnetId
alerts: deployAlerts ? {
availability: true
queryTime: true
transactionTime: true
alertGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
}
}
Expand All @@ -73,15 +79,6 @@ resource maxPreparedTransactionsConfig 'Microsoft.DBforPostgreSQL/flexibleServer
]
}

module databaseAliveAlert '../../components/alerts/flexibleServers/databaseAlive.bicep' = if (deployAlerts) {
name: '${resourceNames.sharedResources.postgreSqlFlexibleServer}DbAliveDeploy'
params: {
resourceNames: [resourceNames.sharedResources.postgreSqlFlexibleServer]
alertsGroupName: resourceNames.existingResources.alertsGroup
tagValues: tagValues
}
}

var managedIdentityConnectionStringTemplate = postgreSqlServerModule.outputs.managedIdentityConnectionStringTemplate

var dataProcessorPsqlConnectionStringSecretKey = 'ees-publicapi-data-processor-connectionstring-publicdatadb'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { Severity } from '../types.bicep'

// Response-time alerts for Application Gateways.
// Deploys one dynamic-threshold metric alert per name supplied in `resourceNames`.

@description('Names of the resources that these alerts are being applied to.')
param resourceNames string[]

@description('The alert severity.')
param severity Severity = 'Warning'

@description('Name of the Alerts Group used to send alert messages.')
param alertsGroupName string

@description('Tags with which to tag the resource in Azure.')
param tagValues object

// Single source of truth for the resource type, used both to build the
// resource id and as the resource type handed to the alert module.
var applicationGatewayResourceType = 'Microsoft.Network/applicationGateways'

module alerts '../dynamicMetricAlert.bicep' = [for name in resourceNames: {
  name: '${name}ResponseTimeAlertModule'
  params: {
    alertName: '${name}-response-time'

    // What is being monitored.
    resourceType: applicationGatewayResourceType
    resourceIds: [
      resourceId(applicationGatewayResourceType, name)
    ]

    // Alert on the averaged 'ApplicationGatewayTotalTime' metric exceeding its dynamic threshold.
    query: {
      metric: 'ApplicationGatewayTotalTime'
      aggregation: 'Average'
      operator: 'GreaterThan'
    }

    // Evaluate every minute over a five minute window.
    windowSize: 'PT5M'
    evaluationFrequency: 'PT1M'

    // Values passed straight through from this file's parameters.
    severity: severity
    alertsGroupName: alertsGroupName
    tagValues: tagValues
  }
}]
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { Severity } from '../types.bicep'

// Response-time alerts for Container Apps.
// Deploys one dynamic-threshold metric alert per name supplied in `resourceNames`.

@description('Names of the resources that these alerts are being applied to.')
param resourceNames string[]

@description('The alert severity.')
param severity Severity = 'Warning'

@description('Name of the Alerts Group used to send alert messages.')
param alertsGroupName string

@description('Tags with which to tag the resource in Azure.')
param tagValues object

// Single source of truth for the resource type, used both to build the
// resource id and as the resource type handed to the alert module.
var containerAppResourceType = 'Microsoft.App/containerApps'

module alerts '../dynamicMetricAlert.bicep' = [for name in resourceNames: {
  name: '${name}LatencyAlertModule'
  params: {
    alertName: '${name}-latency'

    // What is being monitored.
    resourceType: containerAppResourceType
    resourceIds: [
      resourceId(containerAppResourceType, name)
    ]

    // Alert on the averaged 'ResponseTime' metric exceeding its dynamic threshold.
    query: {
      metric: 'ResponseTime'
      aggregation: 'Average'
      operator: 'GreaterThan'
    }

    // Evaluate every minute over a five minute window.
    windowSize: 'PT5M'
    evaluationFrequency: 'PT1M'

    // Values passed straight through from this file's parameters.
    severity: severity
    alertsGroupName: alertsGroupName
    tagValues: tagValues
  }
}]
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import {
EvaluationFrequency
MetricName
DynamicMetricOperator
ResourceType
TimeAggregation
WindowSize
Severity
Sensitivity
severityMapping
} from 'types.bicep'

// Parameters for a single dynamic-threshold metric alert rule.
// The named types used below (ResourceType, MetricName, TimeAggregation, ...) are
// imported from 'types.bicep' at the top of this file.

@description('Name of the alert.')
param alertName string

@description('Ids of the resources that this alert is being applied to.')
param resourceIds string[]

@description('Type of the resource that this alert is being applied to.')
param resourceType ResourceType

// Inline object type: all three properties are required (none is marked optional).
@description('The query being used to test if the alert should be fired.')
param query {
metric: MetricName
aggregation: TimeAggregation
operator: DynamicMetricOperator
}

@description('''
The frequency with which this alert rule evaluates the metrics against the dynamic thresholds.
For instance, PT1M with a window size of PT5M will evaluate the past 5 minutes' worth of metric data
against the dynamic threshold every minute.
''')
param evaluationFrequency EvaluationFrequency = 'PT1M'

@description('''
The timespan that is used to calculate the metric's value against the specified time aggregation.
For instance, PT5M with a time aggregation of "Average" will use 5 minutes of metric data to calculate
the average value, which is then compared to the dynamic threshold.
''')
param windowSize WindowSize = 'PT5M'

@description('''
How sensitive the alert is if a metric exceeds its dynamic threshold.
Low sensitivity means that this alert will fire only if the metric exceeds the threshold by a high degree.
High sensitivity means that this alert will fire if the metric exceeds the threshold to a much lower degree.
''')
param sensitivity Sensitivity = 'Low'

// The two failing-period parameters below both default to 5, so by default every one
// of the last 5 evaluation periods must have failed before the rule fires.
@description('''
How many periods to look back over to count failing periods. Used in conjunction with "minFailingPeriodsToAlert".
As an example, if "numberOfEvaluationPeriods" is set to 5 and "evaluationFrequency" is set to every minute, the past
5 alerts (one for each of the last 5 minutes) is looked at and each failure is counted up.
''')
param numberOfEvaluationPeriods int = 5

// NOTE(review): nothing here enforces minFailingPeriodsToAlert <= numberOfEvaluationPeriods;
// presumably the Azure API rejects an invalid combination - confirm.
@description('''
How many of the "numberOfEvaluationPeriods" results need to have failed in order for this rule to fire.
For instance, if this rule is using the past 5 calculations (with "numberOfEvaluationPeriods" set to 5) to evaluate
whether or not to fire, "minFailingPeriodsToAlert" determines how many of those past 5 periods have to have failed
in order for this rule to fire. If this was set to 3, 3 out of the 5 past calculations will have had to fail in
order for this rule to fire.
''')
param minFailingPeriodsToAlert int = 5

@description('The alert severity.')
param severity Severity = 'Error'

// Nullable parameter (`string?`): callers may leave it out entirely.
// NOTE(review): presumably an ISO 8601 date-time string - confirm the expected format.
@description('''
An optional date that prevents machine learning algorithms from using metric data prior to this date in order to
calculate its dynamic threshold.
''')
param ignoreDataBefore string?

@description('Name of the Alerts Group used to send alert messages.')
param alertsGroupName string

@description('Tags with which to tag the resource in Azure.')
param tagValues object

// Action group (looked up by name, must already exist) that is notified when the alert fires.
resource alertsActionGroup 'Microsoft.Insights/actionGroups@2023-01-01' existing = {
  name: alertsGroupName
}

// The alert rule itself: one dynamic-threshold criterion evaluated against every
// resource id in `resourceIds`.
resource metricAlertRule 'Microsoft.Insights/metricAlerts@2018-03-01' = {
  name: alertName
  location: 'Global'
  tags: tagValues
  properties: {
    enabled: true
    scopes: resourceIds

    // Look up the value for the named severity in the imported severityMapping.
    severity: severityMapping[severity]

    windowSize: windowSize
    evaluationFrequency: evaluationFrequency

    criteria: {
      'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
      allOf: [
        {
          name: 'Metric1'
          criterionType: 'DynamicThresholdCriterion'

          // Which metric to watch and how to compare it to the dynamic threshold.
          metricNamespace: resourceType
          metricName: query.metric
          timeAggregation: query.aggregation
          operator: query.operator
          skipMetricValidation: false

          // Dynamic threshold tuning.
          alertSensitivity: sensitivity
          ignoreDataBefore: ignoreDataBefore
          failingPeriods: {
            numberOfEvaluationPeriods: numberOfEvaluationPeriods
            minFailingPeriodsToAlert: minFailingPeriodsToAlert
          }
        }
      ]
    }

    actions: [
      {
        actionGroupId: alertsActionGroup.id
      }
    ]
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { Severity } from '../types.bicep'

// Latency alerts for the File Service of Storage Accounts.
// Deploys one dynamic-threshold metric alert per storage account name supplied in `resourceNames`.

@description('Names of the resources that these alerts are being applied to.')
param resourceNames string[]

@description('The alert severity.')
param severity Severity = 'Warning'

@description('Name of the Alerts Group used to send alert messages.')
param alertsGroupName string

@description('Tags with which to tag the resource in Azure.')
param tagValues object

// Single source of truth for the resource type, used both to build the
// resource id and as the resource type handed to the alert module.
var fileServiceResourceType = 'Microsoft.Storage/storageAccounts/fileServices'

module alerts '../dynamicMetricAlert.bicep' = [for name in resourceNames: {
  name: '${name}FsLatencyAlertModule'
  params: {
    alertName: '${name}-fileservice-latency'

    // What is being monitored: the 'default' file service child of each storage account.
    resourceType: fileServiceResourceType
    resourceIds: [
      resourceId(fileServiceResourceType, name, 'default')
    ]

    // Alert on the averaged 'SuccessE2ELatency' metric exceeding its dynamic threshold.
    query: {
      metric: 'SuccessE2ELatency'
      aggregation: 'Average'
      operator: 'GreaterThan'
    }

    // Evaluate every minute over a five minute window.
    windowSize: 'PT5M'
    evaluationFrequency: 'PT1M'

    // Values passed straight through from this file's parameters.
    severity: severity
    alertsGroupName: alertsGroupName
    tagValues: tagValues
  }
}]
Loading
Loading