Collapse All | Expand All
vm-health (every 60s) 6

/etc/alerts/alerts-health.yml

Rule Samples Updated
alert: TooManyRestarts (for: 0 seconds) | Details
changes(process_start_time_seconds{job=~"victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
Labels: severity=critical
0 43.354s ago
alert: ServiceDown (for: 120 seconds) | Details
up{job=~"victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0
Labels: severity=critical
0 43.353s ago
alert: ProcessNearFDLimits (for: 300 seconds) | Details
(process_max_fds - process_open_fds) < 100
Labels: severity=critical
0 43.352s ago
alert: TooHighMemoryUsage (for: 300 seconds) | Details
(process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.9
Labels: severity=critical
0 43.351s ago
alert: TooHighCPUUsage (for: 300 seconds) | Details
rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
Labels: severity=critical
0 43.351s ago
alert: TooManyLogs (for: 900 seconds) | Details
sum(increase(vm_log_messages_total{level="error"}[5m])) by (job, instance) > 0
Labels: severity=warning
0 43.350s ago
vmagent (every 30s) 10

/etc/alerts/alerts-vmagent.yml

Rule Samples Updated
alert: PersistentQueueIsDroppingData (for: 600 seconds) | Details
sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0
Labels: severity=critical
0 29.663s ago
alert: RejectedRemoteWriteDataBlocksAreDropped (for: 900 seconds) | Details
sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) by (job, instance) > 0
Labels: severity=warning
0 29.663s ago
alert: TooManyScrapeErrors (for: 900 seconds) | Details
sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0
Labels: severity=warning
1 29.662s ago
alert: TooManyWriteErrors (for: 900 seconds) | Details
(sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance)
+
sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0
Labels: severity=warning
0 29.662s ago
alert: TooManyRemoteWriteErrors (for: 900 seconds) | Details
sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0
Labels: severity=warning
0 29.661s ago
alert: RemoteWriteConnectionIsSaturated (for: 900 seconds) | Details
sum(rate(vmagent_remotewrite_send_duration_seconds_total[5m])) by(job, instance, url)
> 0.9 * max(vmagent_remotewrite_queues) by(job, instance, url)
Labels: severity=warning
0 29.660s ago
alert: PersistentQueueForWritesIsSaturated (for: 900 seconds) | Details
rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
Labels: severity=warning
0 29.660s ago
alert: PersistentQueueForReadsIsSaturated (for: 900 seconds) | Details
rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
Labels: severity=warning
0 29.660s ago
alert: SeriesLimitHourReached (for: 0 seconds) | Details
(vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
Labels: severity=critical
0 29.660s ago
alert: SeriesLimitDayReached (for: 0 seconds) | Details
(vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
Labels: severity=critical
0 29.659s ago
vmsingle (every 30s) 9

/etc/alerts/alerts.yml

Rule Samples Updated
alert: DiskRunsOutOfSpaceIn3Days (for: 1800 seconds) | Details
vm_free_disk_space_bytes / ignoring(path)
(
   (
    rate(vm_rows_added_to_storage_total[1d]) -
    ignoring(type) rate(vm_deduplicated_samples_total{type="merge"}[1d])
   )
  * scalar(
    sum(vm_data_size_bytes{type!="indexdb"}) /
    sum(vm_rows{type!="indexdb"})
   )
) < 3 * 24 * 3600 > 0
Labels: severity=critical
0 18.181s ago
alert: DiskRunsOutOfSpace (for: 1800 seconds) | Details
sum(vm_data_size_bytes) by(instance) /
(
 sum(vm_free_disk_space_bytes) by(instance) +
 sum(vm_data_size_bytes) by(instance)
) > 0.8
Labels: severity=critical
0 18.181s ago
alert: RequestErrorsToAPI (for: 900 seconds) | Details
increase(vm_http_request_errors_total[5m]) > 0
Labels: severity=warning
0 18.179s ago
alert: ConcurrentFlushesHitTheLimit (for: 900 seconds) | Details
avg_over_time(vm_concurrent_addrows_current[1m]) >= vm_concurrent_addrows_capacity
Labels: severity=warning
0 18.179s ago
alert: RowsRejectedOnIngestion (for: 900 seconds) | Details
sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0
Labels: severity=warning
0 18.178s ago
alert: TooHighChurnRate (for: 900 seconds) | Details
(
   sum(rate(vm_new_timeseries_created_total[5m])) by(instance)
   /
   sum(rate(vm_rows_inserted_total[5m])) by (instance)
 ) > 0.1
Labels: severity=warning
0 18.178s ago
alert: TooHighChurnRate24h (for: 900 seconds) | Details
sum(increase(vm_new_timeseries_created_total[24h])) by(instance)
>
(sum(vm_cache_entries{type="storage/hour_metric_ids"}) by(instance) * 3)
Labels: severity=warning
0 18.178s ago
alert: TooHighSlowInsertsRate (for: 900 seconds) | Details
(
   sum(rate(vm_slow_row_inserts_total[5m])) by(instance)
   /
   sum(rate(vm_rows_inserted_total[5m])) by (instance)
 ) > 0.05
Labels: severity=warning
0 18.177s ago
alert: LabelsLimitExceededOnIngestion (for: 900 seconds) | Details
sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
Labels: severity=warning
0 18.177s ago