Skip to content

🚨 GCP Infrastructure Runbooks

Профессиональные runbooks для операций с GCP инфраструктурой

📋 Содержание


1. Cloud SQL Operations

1.1 Подключение к базе данных

1
2
3
4
5
6
# Via Cloud SQL Proxy (recommended): the proxy is started in the background
# and psql then connects to it on the loopback address.
cloud-sql-proxy viktor-integration:us-central1:zashita-postgres-production &
psql -h 127.0.0.1 -U app_user -d zashita_production

# Via gcloud (for administration)
gcloud sql connect zashita-postgres-production --user=postgres --project=viktor-integration

1.2 Failover на реплику

Когда использовать: Primary недоступен более 5 минут

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# Check the status (prints the instance state and the failover replica name)
gcloud sql instances describe zashita-postgres-production \
  --format="value(state,failoverReplica.name)"

# Forced failover
gcloud sql instances failover zashita-postgres-production \
  --project=viktor-integration

# Check the new primary (name, state and first IP address)
# NOTE(review): unlike the failover command, the two describe calls pass no
# --project and therefore rely on the active gcloud configuration - confirm.
gcloud sql instances describe zashita-postgres-production \
  --format="table(name,state,ipAddresses[0].ipAddress)"

1.3 Восстановление из бэкапа

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
# List backups
gcloud sql backups list \
  --instance=zashita-postgres-production \
  --project=viktor-integration

# Restore into a separate instance (recommended).
# `gcloud sql backups restore` takes the backup ID as its positional argument;
# --restore-instance is the instance that gets OVERWRITTEN and
# --backup-instance is the instance the backup was taken from. Keeping
# production only on --backup-instance ensures it is never the restore target.
# NOTE(review): the restore target is expected to exist already - create
# zashita-postgres-restored first if it does not.
gcloud sql backups restore BACKUP_ID \
  --restore-instance=zashita-postgres-restored \
  --backup-instance=zashita-postgres-production \
  --project=viktor-integration

# Or restore to a point in time (clones production into a new instance)
gcloud sql instances clone zashita-postgres-production zashita-postgres-pitr \
  --point-in-time="2026-02-07T10:00:00Z" \
  --project=viktor-integration

1.4 Аварийное масштабирование

1
2
3
4
5
6
7
8
9
# Increase CPU/RAM
gcloud sql instances patch zashita-postgres-production \
  --tier=db-custom-4-16384 \
  --project=viktor-integration

# Increase storage
gcloud sql instances patch zashita-postgres-production \
  --storage-size=200GB \
  --project=viktor-integration

1.5 Медленные запросы

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
-- Find slow queries: active statements running for more than 5 seconds
SELECT pid, now() - pg_stat_activity.query_start AS duration, query
FROM pg_stat_activity
WHERE state = 'active' AND (now() - pg_stat_activity.query_start) > interval '5 seconds';

-- Kill a query: replace <PID> with a pid returned by the query above.
-- A bare `pid` is not valid here because this statement has no FROM clause;
-- the placeholder is deliberately invalid SQL so it cannot be run unedited.
SELECT pg_terminate_backend(<PID>);

-- Check locks: lock requests that have not been granted yet
SELECT * FROM pg_locks WHERE NOT granted;

2. Redis Operations

2.1 Подключение к Redis

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Get the host of the production Redis instance
REDIS_HOST=$(gcloud redis instances describe zashita-redis-production \
  --region=us-central1 --format='value(host)')

# Connect through the jump host (forwards local port 6379 to Redis)
ssh -L "6379:${REDIS_HOST}:6379" user@jump-host

# Or through a Compute Engine VM
gcloud compute ssh redis-bastion --zone=us-central1-a -- \
  redis-cli -h "$REDIS_HOST"

2.2 Failover Redis HA

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# Check the status
gcloud redis instances describe zashita-redis-production \
  --region=us-central1 \
  --format="table(name,state,host,currentLocationId)"

# Forced failover
gcloud redis instances failover zashita-redis-production \
  --region=us-central1 \
  --data-protection-mode=limited-data-loss

# After the failover, update DNS/config

2.3 Очистка памяти

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# Every step is a one-shot redis-cli call against $REDIS_HOST, so the snippet
# can be pasted into a shell as-is (no interactive session needed).

# Check memory usage
redis-cli -h "$REDIS_HOST" INFO memory

# Find big keys
redis-cli -h "$REDIS_HOST" --bigkeys

# Delete keys matching a specific pattern.
# --scan iterates with SCAN instead of KEYS, which blocks the server while it
# walks the whole keyspace. The deleting redis-cli also needs -h: without it
# DEL is sent to localhost instead of $REDIS_HOST.
# NOTE(review): xargs splits on whitespace, so this assumes the matched key
# names contain no spaces or quotes - confirm for this key pattern.
redis-cli -h "$REDIS_HOST" --scan --pattern 'session:expired:*' \
  | xargs redis-cli -h "$REDIS_HOST" DEL

# Enable active defragmentation
# NOTE(review): managed Redis services often block the CONFIG command; if this
# is rejected, set activedefrag through the provider's instance configuration.
redis-cli -h "$REDIS_HOST" CONFIG SET activedefrag yes

2.4 Export/Import данных

1
2
3
4
5
6
7
8
9
# Export to GCS (the object name carries today's date as YYYYMMDD)
gcloud redis instances export zashita-redis-production \
  gs://zashita-backups-production/redis/export-$(date +%Y%m%d).rdb \
  --region=us-central1

# Import from GCS (substitute the date of the export to load)
gcloud redis instances import zashita-redis-production \
  gs://zashita-backups-production/redis/export-20260207.rdb \
  --region=us-central1

3. Pub/Sub Operations

3.1 Мониторинг очередей

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# Check the backlog
# NOTE(review): the format below prints only the subscription name and
# ackDeadlineSeconds, not a message count - confirm this is the intended check.
gcloud pubsub subscriptions describe events-worker-subscription \
  --format="table(name,ackDeadlineSeconds)" \
  --project=viktor-integration

# Metrics (first 20 lines of the metric descriptors matching "pubsub")
gcloud monitoring metrics list \
  --filter="metric.type:pubsub" \
  --project=viktor-integration | head -20

# List unacked messages (pulls up to 10 without acknowledging them)
gcloud pubsub subscriptions pull events-worker-subscription \
  --limit=10 --auto-ack=false

3.2 Dead Letter Queue обработка

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
# Record the time BEFORE pulling: the final seek uses it so that messages
# arriving in the DLQ after this point are kept rather than silently dropped.
SEEK_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)

# Inspect the DLQ (messages are saved to a file and are not acknowledged)
gcloud pubsub subscriptions pull events-dlq-subscription \
  --limit=100 --format=json > dlq_messages.json

# Republish after the fix.
# .message.data is base64-encoded; each payload is decoded and published to
# the main topic. read -r keeps the encoded line intact.
# NOTE(review): only the payload is replayed - message attributes and
# ordering keys are not carried over.
jq -r '.[] | .message.data' dlq_messages.json |
  while IFS= read -r msg; do
    gcloud pubsub topics publish events-topic \
      --message="$(printf '%s' "$msg" | base64 -d)"
  done

# Purge the DLQ (after processing).
# WARNING: seek acknowledges every message published before SEEK_TIME, not
# just the (at most 100) messages saved above. Re-run the pull until it
# returns nothing new before purging.
gcloud pubsub subscriptions seek events-dlq-subscription \
  --time="$SEEK_TIME"

3.3 Масштабирование подписчиков

1
2
3
4
5
6
7
# Increase parallelism (in the application)
# Update flowControl in the subscriber config:
# maxOutstandingMessages: 1000 -> 5000
# maxOutstandingBytes: 100MB -> 500MB

# Or deploy more instances
kubectl scale deployment event-processor --replicas=5

4. Disaster Recovery

4.1 RTO/RPO цели

ΠšΠΎΠΌΠΏΠΎΠ½Π΅Π½Ρ‚ RPO RTO
Cloud SQL 5 ΠΌΠΈΠ½ (PITR) 15 ΠΌΠΈΠ½
Redis 24 часа 30 ΠΌΠΈΠ½
Pub/Sub 0 (persistent) 5 ΠΌΠΈΠ½
Cloud Storage 0 5 ΠΌΠΈΠ½

4.2 DR Playbook - Полный отказ региона

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/bin/bash
# dr-failover.sh - switch production over to the DR region.
#
# Required environment:
#   NEW_IP         IPv4 address of the DR API endpoint to publish in DNS.
# Optional environment:
#   SLACK_WEBHOOK  Slack incoming-webhook URL. When unset, the notification
#                  step is skipped with a warning instead of aborting.

set -euo pipefail

readonly DR_REGION="us-east1"
readonly DR_PROJECT="viktor-integration"

# Validate inputs before touching any infrastructure: an A record cannot be
# set to a placeholder string, and failing here is cheaper than failing
# half-way through a failover.
: "${NEW_IP:?set NEW_IP to the IPv4 address of the DR API endpoint}"
if [[ ! "$NEW_IP" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then
  echo "NEW_IP is not an IPv4 address: $NEW_IP" >&2
  exit 1
fi

echo "🚨 Initiating DR Failover to $DR_REGION"

# 1. Switch Cloud SQL
# NOTE(review): `instances failover` is issued against the "-dr" instance; if
# that instance is a cross-region read replica it may need `promote-replica`
# instead - confirm against the actual topology.
gcloud sql instances failover zashita-postgres-production-dr \
  --project="$DR_PROJECT"

# 2. Look up the DR Redis endpoint (printed so operators can update clients)
NEW_REDIS=$(gcloud redis instances describe zashita-redis-dr \
  --region="$DR_REGION" --project="$DR_PROJECT" --format='value(host)')
echo "New Redis: $NEW_REDIS"

# 3. Update DNS to point the API at the DR endpoint
gcloud dns record-sets update api.zashita.io \
  --zone=zashita-zone \
  --type=A \
  --rrdatas="$NEW_IP" \
  --ttl=60

# 4. Notify team (best effort: the failover itself is already done, so a
#    missing webhook or a failed request must not turn success into failure)
if [[ -n "${SLACK_WEBHOOK:-}" ]]; then
  payload=$(printf '{"text":"🚨 DR Failover Complete - Region: %s"}' "$DR_REGION")
  curl --silent --show-error -X POST \
    -H 'Content-Type: application/json' \
    -d "$payload" "$SLACK_WEBHOOK" \
    || echo "warning: Slack notification failed" >&2
else
  echo "warning: SLACK_WEBHOOK is not set; skipping Slack notification" >&2
fi

echo "✅ DR Failover Complete"

4.3 DR Test Checklist

  • Бэкапы восстанавливаются корректно
  • Реплики синхронизированы
  • DNS переключение работает
  • Приложения переключаются на DR
  • Мониторинг видит DR ресурсы
  • Документация актуальна

5. Incident Response

5.1 Severity Levels

| Level | Описание         | Response Time | Example            |
|-------|------------------|---------------|--------------------|
| SEV1  | Полный outage    | 15 мин        | Все API недоступны |
| SEV2  | Частичный outage | 30 мин        | 50% ошибок         |
| SEV3  | Деградация       | 2 часа        | Медленные запросы  |
| SEV4  | Minor issue      | 24 часа       | UI баг             |

5.2 SEV1 Response - API Down

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
# 1. Check service status
docker ps -a
kubectl get pods -A

# 2. Check Cloud SQL
gcloud sql instances describe zashita-postgres-production \
  --format="value(state)"

# 3. Check the logs (errors from the last 10 minutes)
gcloud logging read "severity>=ERROR" --limit=50 --freshness=10m

# 4. Try a restart
docker-compose -f docker-compose.gcp.yml restart z-core-api

# 5. If that does not help - failover
./scripts/dr-failover.sh

# 6. Communication
# - Update the status page
# - Notify stakeholders
# - Start the postmortem

5.3 SEV2 Response - High Error Rate

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
# 1. Identify the source
gcloud logging read \
  'resource.type="cloud_run_revision" AND severity>=ERROR' \
  --limit=100 --format=json | jq '.[] | .jsonPayload'

# 2. Check the metrics
gcloud monitoring dashboards list

# 3. Check recent deployments
git log --oneline -10
kubectl rollout history deployment/api

# 4. Roll back if needed
kubectl rollout undo deployment/api

# 5. Scale out if the cause is load
kubectl scale deployment/api --replicas=10

5.4 Postmortem Template

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Postmortem: [Title]

**Date:** YYYY-MM-DD
**Duration:** X hours Y minutes
**Severity:** SEVX
**Author:** Name

## Summary

Brief description of the incident.

## Impact

- X users affected
- Y% error rate
- $Z revenue impact

## Timeline

- HH:MM - Alert fired
- HH:MM - Engineer engaged
- HH:MM - Root cause identified
- HH:MM - Fix deployed
- HH:MM - Incident resolved

## Root Cause

Technical explanation of what went wrong.

## Resolution

What was done to fix the issue.

## Lessons Learned

### What went well

-

### What could be improved

-

## Action Items

- [ ] Action 1 - Owner - Due Date
- [ ] Action 2 - Owner - Due Date

6. Scaling Procedures

6.1 Вертикальное масштабирование

Cloud SQL

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# Current tier
gcloud sql instances describe zashita-postgres-production \
  --format="value(settings.tier)"

# Increase (requires ~1 min of downtime)
gcloud sql instances patch zashita-postgres-production \
  --tier=db-custom-4-16384

# Available tiers:
# db-custom-2-8192   = 2 vCPU, 8 GB
# db-custom-4-16384  = 4 vCPU, 16 GB
# db-custom-8-32768  = 8 vCPU, 32 GB
# db-custom-16-65536 = 16 vCPU, 64 GB

Redis

1
2
3
4
5
6
7
8
# Current size
gcloud redis instances describe zashita-redis-production \
  --region=us-central1 --format="value(memorySizeGb)"

# Increase (zero-downtime for HA)
gcloud redis instances update zashita-redis-production \
  --region=us-central1 \
  --size=4

6.2 Горизонтальное масштабирование

Cloud SQL Read Replicas

1
2
3
4
5
6
7
8
# Create a replica of the production instance
gcloud sql instances create zashita-postgres-replica \
  --master-instance-name=zashita-postgres-production \
  --tier=db-custom-2-8192 \
  --zone=us-central1-b

# Route read traffic to the replica (in the application)
# DATABASE_URL_READ=postgres://...replica...

Application Scaling

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Scale the z-core-api workload; use the variant matching where it runs.

# Kubernetes
kubectl scale deployment/z-core-api --replicas=10

# Cloud Run
gcloud run services update z-core-api \
  --min-instances=5 \
  --max-instances=100

# Docker Compose (local)
docker-compose up -d --scale z-core-api=3

6.3 Auto-scaling конфигурация

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Kubernetes HPA for the z-core-api Deployment: keeps between 2 and 20
# replicas, driven by CPU and memory utilization targets.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: z-core-api-hpa
spec:
  # Workload whose replica count this autoscaler manages
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: z-core-api
  minReplicas: 2
  maxReplicas: 20
  metrics:
    # Target an average CPU utilization of 70%
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Target an average memory utilization of 80%
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

📞 Контакты

| Role              | Contact       | Escalation |
|-------------------|---------------|------------|
| On-Call Primary   | PagerDuty     | Automatic  |
| On-Call Secondary | Slack #oncall | 30 min     |
| Team Lead         | Direct        | SEV1/SEV2  |
| Management        | Email         | SEV1       |

ПослСднСС ΠΎΠ±Π½ΠΎΠ²Π»Π΅Π½ΠΈΠ΅: 2026-02-07 ВСрсия: 1.0.0