🚨 GCP Infrastructure Runbooks¶

Профессиональные runbooks для операций с GCP инфраструктурой

📋 Содержание¶

1. Cloud SQL Operations
2. Redis Operations
3. Pub/Sub Operations
4. Disaster Recovery
5. Incident Response
6. Scaling Procedures

1. Cloud SQL Operations¶

1.1 Подключение к базе данных¶

# Через Cloud SQL Proxy (рекомендуется)
cloud-sql-proxy viktor-integration:us-central1:zashita-postgres-production &
psql -h 127.0.0.1 -U app_user -d zashita_production

# Через gcloud (для администрирования)
gcloud sql connect zashita-postgres-production --user=postgres --project=viktor-integration

1.2 Failover на реплику¶

Когда использовать: Primary недоступен более 5 минут

# Проверить статус
gcloud sql instances describe zashita-postgres-production \
  --format="value(state,failoverReplica.name)"

# Принудительный failover
gcloud sql instances failover zashita-postgres-production \
  --project=viktor-integration

# Проверить новый primary
gcloud sql instances describe zashita-postgres-production \
  --format="table(name,state,ipAddresses[0].ipAddress)"

1.3 Восстановление из бэкапа¶

# Список бэкапов
gcloud sql backups list \
  --instance=zashita-postgres-production \
  --project=viktor-integration

# Restore a backup into a separate instance (recommended).
# The backup ID is the positional argument. --restore-instance is the TARGET
# whose data is overwritten (it must already exist); --backup-instance is the
# instance the backup was taken from. Get BACKUP_ID from the backups list.
gcloud sql backups restore BACKUP_ID \
  --restore-instance=zashita-postgres-restored \
  --backup-instance=zashita-postgres-production \
  --project=viktor-integration

# Или восстановить point-in-time
gcloud sql instances clone zashita-postgres-production zashita-postgres-pitr \
  --point-in-time="2026-02-07T10:00:00Z" \
  --project=viktor-integration

1.4 Аварийное масштабирование¶

# Увеличить CPU/RAM
gcloud sql instances patch zashita-postgres-production \
  --tier=db-custom-4-16384 \
  --project=viktor-integration

# Увеличить storage
gcloud sql instances patch zashita-postgres-production \
  --storage-size=200GB \
  --project=viktor-integration

1.5 Медленные запросы¶

-- Найти медленные запросы
SELECT pid, now() - pg_stat_activity.query_start AS duration, query
FROM pg_stat_activity
WHERE state = 'active' AND (now() - pg_stat_activity.query_start) > interval '5 seconds';

-- Kill a query: replace 12345 with the pid reported by the slow-query
-- search. A bare "pid" is not valid here because there is no FROM clause.
SELECT pg_terminate_backend(12345);

-- Проверить блокировки
SELECT * FROM pg_locks WHERE NOT granted;

2. Redis Operations¶

2.1 Подключение к Redis¶

# Получить хост
REDIS_HOST=$(gcloud redis instances describe zashita-redis-production \
  --region=us-central1 --format='value(host)')

# Подключение через jump host
ssh -L 6379:$REDIS_HOST:6379 user@jump-host

# Или через Compute Engine VM
gcloud compute ssh redis-bastion --zone=us-central1-a -- \
  redis-cli -h $REDIS_HOST

2.2 Failover Redis HA¶

# Проверить статус
gcloud redis instances describe zashita-redis-production \
  --region=us-central1 \
  --format="table(name,state,host,currentLocationId)"

# Принудительный failover
gcloud redis instances failover zashita-redis-production \
  --region=us-central1 \
  --data-protection-mode=limited-data-loss

# После failover обновить DNS/конфиг

2.3 Очистка памяти¶

# Check memory usage (one-shot command, no interactive session needed)
redis-cli -h "$REDIS_HOST" INFO memory

# Find the largest keys
redis-cli -h "$REDIS_HOST" --bigkeys

# Delete keys matching a pattern.
# --scan iterates incrementally instead of blocking the server the way KEYS
# does, and the deleting redis-cli needs -h as well: without it DEL is sent
# to localhost rather than to the instance.
redis-cli -h "$REDIS_HOST" --scan --pattern 'session:expired:*' \
  | xargs -r -n 100 redis-cli -h "$REDIS_HOST" DEL

# Enable active defragmentation.
# Memorystore blocks the CONFIG command, so the parameter is changed through
# the instance configuration instead of CONFIG SET.
gcloud redis instances update zashita-redis-production \
  --region=us-central1 \
  --update-redis-config activedefrag=yes

2.4 Export/Import данных¶

# Export to GCS.
# Positional order is DESTINATION first, then INSTANCE.
gcloud redis instances export \
  "gs://zashita-backups-production/redis/export-$(date +%Y%m%d).rdb" \
  zashita-redis-production \
  --region=us-central1

# Import from GCS (SOURCE first, then INSTANCE).
# Importing replaces the data currently held by the instance.
gcloud redis instances import \
  gs://zashita-backups-production/redis/export-20260207.rdb \
  zashita-redis-production \
  --region=us-central1

3. Pub/Sub Operations¶

3.1 Мониторинг очередей¶

# Show the subscription's name and ack deadline.
# NOTE(review): this output does not contain the backlog size; the backlog is
# presumably read from the Cloud Monitoring metric
# pubsub.googleapis.com/subscription/num_undelivered_messages - confirm.
gcloud pubsub subscriptions describe events-worker-subscription \
  --format="table(name,ackDeadlineSeconds)" \
  --project=viktor-integration

# List metrics whose type matches "pubsub" (first 20 lines of output)
gcloud monitoring metrics list \
  --filter="metric.type:pubsub" \
  --project=viktor-integration | head -20

# Pull up to 10 pending messages without acknowledging them
gcloud pubsub subscriptions pull events-worker-subscription \
  --limit=10 --auto-ack=false

3.2 Dead Letter Queue обработка¶

# Dump the DLQ for inspection. No --auto-ack is passed, so the pulled
# messages stay in the subscription until the seek at the end.
gcloud pubsub subscriptions pull events-dlq-subscription \
  --limit=100 --format=json > dlq_messages.json

# Re-publish after the fix has been deployed.
# .message.data is base64-encoded in the JSON output; entries without a data
# field are skipped. read -r keeps backslashes intact.
# NOTE: only the payload is re-published - attributes and ordering keys of
# the original messages are not carried over.
jq -r '.[] | .message.data // empty' dlq_messages.json \
  | while IFS= read -r msg; do
      gcloud pubsub topics publish events-topic \
        --message="$(printf '%s' "$msg" | base64 -d)"
    done

# Purge the DLQ (after processing).
# WARNING: seeking to "now" marks EVERY message published before this moment
# as acknowledged, including messages beyond the 100 dumped above. Repeat the
# pull until it returns nothing before running this.
gcloud pubsub subscriptions seek events-dlq-subscription \
  --time="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

3.3 Масштабирование подписчиков¶

# Увеличить параллелизм (в приложении)
# Обновить flowControl в subscriber config:
# maxOutstandingMessages: 1000 -> 5000
# maxOutstandingBytes: 100MB -> 500MB

# Или развернуть больше инстансов
kubectl scale deployment event-processor --replicas=5

4. Disaster Recovery¶

4.1 RTO/RPO цели¶

| Компонент | RPO | RTO |
|-----------|-----|-----|
| Cloud SQL | 5 мин (PITR) | 15 мин |
| Redis | 24 часа | 30 мин |
| Pub/Sub | 0 (persistent) | 5 мин |
| Cloud Storage | 0 | 5 мин |

4.2 DR Playbook - Полный отказ региона¶

#!/bin/bash
# dr-failover.sh - switch production over to the DR region.
#
# Required environment:
#   DR_API_IP      IPv4 address that api.zashita.io must point to in the
#                  DR region
#   SLACK_WEBHOOK  Slack incoming-webhook URL for the completion notice
#
# Steps: promote the DR Cloud SQL replica, print the DR Redis host,
# repoint the API DNS record, notify the team.

set -euo pipefail

# Validate inputs up front: with set -u a missing variable would otherwise
# abort the script only after the database had already been switched.
: "${DR_API_IP:?DR_API_IP must be set to the DR API endpoint address}"
: "${SLACK_WEBHOOK:?SLACK_WEBHOOK must be set to the Slack webhook URL}"

readonly DR_REGION="us-east1"
readonly DR_PROJECT="viktor-integration"

echo "🚨 Initiating DR Failover to $DR_REGION"

# 1. Switch Cloud SQL.
# "instances failover" only triggers an in-region HA failover; a replica in
# another region becomes the writable primary through promote-replica.
# NOTE(review): assumes zashita-postgres-production-dr is a cross-region read
# replica of the production instance - confirm. Promotion is irreversible.
gcloud sql instances promote-replica zashita-postgres-production-dr \
  --project="$DR_PROJECT"

# 2. Look up the DR Redis endpoint (applications must be reconfigured to it)
NEW_REDIS=$(gcloud redis instances describe zashita-redis-dr \
  --region="$DR_REGION" --format='value(host)')
echo "New Redis: $NEW_REDIS"

# 3. Repoint DNS at the DR endpoint (previously the literal string "NEW_IP"
# was sent as the record data)
gcloud dns record-sets update api.zashita.io \
  --zone=zashita-zone \
  --type=A \
  --rrdatas="$DR_API_IP" \
  --ttl=60

# 4. Notify team. Best effort: the failover itself is already done, so a
# failed notification is reported but does not fail the script.
payload=$(printf '{"text":"🚨 DR Failover Complete - Region: %s"}' "$DR_REGION")
if ! curl --fail --silent --show-error -X POST \
    -H 'Content-Type: application/json' \
    -d "$payload" "$SLACK_WEBHOOK"; then
  echo "WARNING: Slack notification failed" >&2
fi

echo "✅ DR Failover Complete"

4.3 DR Test Checklist¶

- [ ] Бэкапы восстанавливаются корректно
- [ ] Реплики синхронизированы
- [ ] DNS переключение работает
- [ ] Приложения переключаются на DR
- [ ] Мониторинг видит DR ресурсы
- [ ] Документация актуальна

5. Incident Response¶

5.1 Severity Levels¶

| Level | Описание | Response Time | Example |
|-------|----------|---------------|---------|
| SEV1 | Полный outage | 15 мин | Все API недоступны |
| SEV2 | Частичный outage | 30 мин | 50% ошибок |
| SEV3 | Деградация | 2 часа | Медленные запросы |
| SEV4 | Minor issue | 24 часа | UI баг |

5.2 SEV1 Response - API Down¶

# 1. Проверить статус сервисов
docker ps -a
kubectl get pods -A

# 2. Проверить Cloud SQL
gcloud sql instances describe zashita-postgres-production \
  --format="value(state)"

# 3. Проверить логи
gcloud logging read "severity>=ERROR" --limit=50 --freshness=10m

# 4. Попробовать рестарт
docker-compose -f docker-compose.gcp.yml restart z-core-api

# 5. Если не помогает - failover
./scripts/dr-failover.sh

# 6. Коммуникация
# - Обновить status page
# - Уведомить stakeholders
# - Начать postmortem

5.3 SEV2 Response - High Error Rate¶

# 1. Идентифицировать источник
gcloud logging read \
  'resource.type="cloud_run_revision" AND severity>=ERROR' \
  --limit=100 --format=json | jq '.[] | .jsonPayload'

# 2. Проверить метрики
gcloud monitoring dashboards list

# 3. Проверить recent deployments
git log --oneline -10
kubectl rollout history deployment/api

# 4. Rollback если нужно
kubectl rollout undo deployment/api

# 5. Scale если нагрузка
kubectl scale deployment/api --replicas=10

5.4 Postmortem Template¶

# Postmortem: [Title]

**Date:** YYYY-MM-DD
**Duration:** X hours Y minutes
**Severity:** SEVX
**Author:** Name

## Summary

Brief description of the incident.

## Impact

- X users affected
- Y% error rate
- $Z revenue impact

## Timeline

- HH:MM - Alert fired
- HH:MM - Engineer engaged
- HH:MM - Root cause identified
- HH:MM - Fix deployed
- HH:MM - Incident resolved

## Root Cause

Technical explanation of what went wrong.

## Resolution

What was done to fix the issue.

## Lessons Learned

### What went well

-

### What could be improved

-

## Action Items

- [ ] Action 1 - Owner - Due Date
- [ ] Action 2 - Owner - Due Date

6. Scaling Procedures¶

6.1 Вертикальное масштабирование¶

Cloud SQL¶

# Текущий tier
gcloud sql instances describe zashita-postgres-production \
  --format="value(settings.tier)"

# Увеличить (требует downtime ~1 мин)
gcloud sql instances patch zashita-postgres-production \
  --tier=db-custom-4-16384

# Tiers доступные:
# db-custom-2-8192   = 2 vCPU, 8 GB
# db-custom-4-16384  = 4 vCPU, 16 GB
# db-custom-8-32768  = 8 vCPU, 32 GB
# db-custom-16-65536 = 16 vCPU, 64 GB

Redis¶

# Текущий размер
gcloud redis instances describe zashita-redis-production \
  --region=us-central1 --format="value(memorySizeGb)"

# Увеличить (zero-downtime для HA)
gcloud redis instances update zashita-redis-production \
  --region=us-central1 \
  --size=4

6.2 Горизонтальное масштабирование¶

Cloud SQL Read Replicas¶

# Создать реплику
gcloud sql instances create zashita-postgres-replica \
  --master-instance-name=zashita-postgres-production \
  --tier=db-custom-2-8192 \
  --zone=us-central1-b

# Направить read traffic на реплику (в приложении)
# DATABASE_URL_READ=postgres://...replica...

Application Scaling¶

# Kubernetes
kubectl scale deployment/z-core-api --replicas=10

# Cloud Run
gcloud run services update z-core-api \
  --min-instances=5 \
  --max-instances=100

# Docker Compose (local)
docker-compose up -d --scale z-core-api=3

6.3 Auto-scaling конфигурация¶

# Kubernetes HPA
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: z-core-api-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: z-core-api
  minReplicas: 2
  maxReplicas: 20
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

📞 Контакты¶

| Role | Contact | Escalation |
|------|---------|------------|
| On-Call Primary | PagerDuty | Automatic |
| On-Call Secondary | Slack #oncall | 30 min |
| Team Lead | Direct | SEV1/SEV2 |
| Management | Email | SEV1 |

Последнее обновление: 2026-02-07

Версия: 1.0.0