2020-10-24 23:57:45 +05:30
|
|
|
# Name of this dashboard definition.
dashboard: 'Overview'
|
2020-03-13 15:44:24 +05:30
|
|
|
# NOTE(review): presumably ranks this dashboard relative to others — confirm against the consumer.
priority: 1
|
2020-06-23 00:09:42 +05:30
|
|
|
|
|
|
|
# Template variables for this dashboard. Queries in this file reference a
# variable with the {{name}} placeholder, e.g. instance=~"{{instance}}".
templating:
|
|
|
|
variables:
|
|
|
|
# Free-text variable; per its label it holds a regex applied to the instance label.
instance:
|
|
|
|
type: 'text'
|
|
|
|
label: 'Instance label regex'
|
|
|
|
options:
|
|
|
|
# '.+' is a regex that matches any non-empty value, i.e. no instance filtering by default.
default_value: '.+'
|
|
|
|
|
2020-03-13 15:44:24 +05:30
|
|
|
# List of panel groups; each entry has a `group` name and a list of `panels`.
panel_groups:
|
2020-06-23 00:09:42 +05:30
|
|
|
|
|
|
|
# Node-level memory and CPU panels, filtered by the {{instance}} template variable.
- group: 'Resource usage'
|
|
|
|
panels:
|
|
|
|
# Line chart of the percentage of memory in use, for instances matching {{instance}}.
- title: "Memory usage"
|
|
|
|
type: "line-chart"
|
|
|
|
y_label: "% memory used"
|
|
|
|
metrics:
|
|
|
|
- id: node_memory_usage_percentage
|
|
|
|
# (1 - available / total) * 100. Available memory is node_memory_MemAvailable_bytes;
# where that series is missing, the `or` branch substitutes
# MemFree + Buffers + Cached + Slab.
query_range: '(1 - (node_memory_MemAvailable_bytes{instance=~"{{instance}}"} or (node_memory_MemFree_bytes{instance=~"{{instance}}"} + node_memory_Buffers_bytes{instance=~"{{instance}}"} + node_memory_Cached_bytes{instance=~"{{instance}}"} + node_memory_Slab_bytes{instance=~"{{instance}}"})) / node_memory_MemTotal_bytes{instance=~"{{instance}}"}) * 100'
|
|
|
|
unit: "%"
|
|
|
|
label: instance
|
|
|
|
|
|
|
|
# Line chart of the percentage of CPU time spent non-idle, for instances matching {{instance}}.
- title: "CPU usage"
|
|
|
|
type: "line-chart"
|
|
|
|
y_label: "% CPU used"
|
|
|
|
metrics:
|
|
|
|
- id: node_cpu_usage_percentage
|
|
|
|
# 1 - irate of idle-mode CPU seconds over a 5m window gives the busy fraction;
# `avg without (mode,cpu)` averages it across the mode and cpu labels, then * 100.
query_range: '(avg without (mode,cpu) (1 - irate(node_cpu_seconds_total{mode="idle",instance=~"{{instance}}"}[5m]))) * 100'
|
|
|
|
unit: "%"
|
|
|
|
label: instance
|
|
|
|
|
2020-03-13 15:44:24 +05:30
|
|
|
# Error-ratio panel for series labelled type="web", stage="main".
- group: Web Service
|
|
|
|
panels:
|
|
|
|
- title: Web Service - Error Ratio
|
|
|
|
type: line-chart
|
|
|
|
y_label: "Unhandled Exceptions (%)"
|
|
|
|
metrics:
|
|
|
|
- id: wser_web_service
|
|
|
|
# Max over the last 1m of gitlab_service_errors:ratio, then max by type, scaled to percent.
# The environment selector is filled from the {{ci_environment_slug}} placeholder.
query_range: 'max(max_over_time(gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="web", stage="main"}[1m])) by (type) * 100'
|
|
|
|
unit: "%"
|
|
|
|
label: "Error Ratio"
|
|
|
|
- id: wser_degradation_slo
|
|
|
|
# Degradation threshold: the environment-specific SLO ratio, falling back to the
# type="web" SLO series, scaled to percent. The `or` expression is parenthesized
# because PromQL's `*` binds tighter than `or`; without the parentheses only the
# fallback branch would be multiplied by 100.
query_range: '(avg(slo:max:gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="web", stage="main"}) or avg(slo:max:gitlab_service_errors:ratio{type="web"})) * 100'
|
|
|
|
unit: "%"
|
|
|
|
label: "Degradation SLO"
|
|
|
|
- id: wser_outage_slo
|
|
|
|
# Outage threshold: twice the SLO ratio (environment-specific series, falling back
# to the type="web" SLO series), scaled to percent.
query_range: '2 * (avg(slo:max:gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="web", stage="main"}) or avg(slo:max:gitlab_service_errors:ratio{type="web"})) * 100'
|
|
|
|
unit: "%"
|
|
|
|
label: "Outage SLO"
|
|
|
|
# Error-ratio panel for series labelled type="api", stage="main".
- group: API Service
|
|
|
|
panels:
|
|
|
|
- title: API Service - Error Ratio
|
|
|
|
type: line-chart
|
|
|
|
y_label: "Unhandled Exceptions (%)"
|
|
|
|
metrics:
|
|
|
|
# NOTE(review): the id says "web_service" although the query selects type="api" —
# looks like a copy-paste from the Web Service group; confirm nothing references
# this id before renaming.
- id: aser_web_service
|
|
|
|
# Max over the last 1m of gitlab_service_errors:ratio, then max by type, scaled to percent.
# The environment selector is filled from the {{ci_environment_slug}} placeholder.
query_range: 'max(max_over_time(gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="api", stage="main"}[1m])) by (type) * 100'
|
|
|
|
unit: "%"
|
|
|
|
label: "Error Ratio"
|
|
|
|
- id: aser_degradation_slo
|
|
|
|
# Degradation threshold: the environment-specific SLO ratio for type="api", falling
# back to the type="api" SLO series, scaled to percent. The `or` expression is
# parenthesized because PromQL's `*` binds tighter than `or`; without the parentheses
# only the fallback branch would be multiplied by 100. The fallback selects
# type="api" (not "web") so this panel never plots the web service's SLO.
query_range: '(avg(slo:max:gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="api", stage="main"}) or avg(slo:max:gitlab_service_errors:ratio{type="api"})) * 100'
|
|
|
|
unit: "%"
|
|
|
|
label: "Degradation SLO"
|
|
|
|
- id: aser_outage_slo
|
|
|
|
# Outage threshold: twice the SLO ratio for type="api" (environment-specific series,
# falling back to the type="api" SLO series), scaled to percent. The fallback
# selects type="api" (not "web") so this panel never plots the web service's SLO.
query_range: '2 * (avg(slo:max:gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="api", stage="main"}) or avg(slo:max:gitlab_service_errors:ratio{type="api"})) * 100'
|
|
|
|
unit: "%"
|
|
|
|
label: "Outage SLO"
|