🚨 GCP Infrastructure Runbooks
Профессиональные runbooks для операций с GCP инфраструктурой
📋 Содержание
1. Cloud SQL Operations
1.1 Подключение к базе данных
| # Через Cloud SQL Proxy (рекомендуется)
cloud-sql-proxy viktor-integration:us-central1:zashita-postgres-production &
psql -h 127.0.0.1 -U app_user -d zashita_production
# Через gcloud (для администрирования)
gcloud sql connect zashita-postgres-production --user=postgres --project=viktor-integration
|
1.2 Failover на реплику
Когда использовать: Primary недоступен более 5 минут
| # Проверить статус
gcloud sql instances describe zashita-postgres-production \
--format="value(state,failoverReplica.name)"
# Принудительный failover
gcloud sql instances failover zashita-postgres-production \
--project=viktor-integration
# Проверить новый primary
gcloud sql instances describe zashita-postgres-production \
--format="table(name,state,ipAddresses[0].ipAddress)"
|
1.3 Восстановление из бэкапа
| # Список бэкапов
gcloud sql backups list \
--instance=zashita-postgres-production \
--project=viktor-integration
# Восстановить в новый инстанс (рекомендуется)
gcloud sql instances restore-backup zashita-postgres-production \
--restore-instance=zashita-postgres-restored \
--backup-id=BACKUP_ID \
--project=viktor-integration
# Или восстановить point-in-time
gcloud sql instances clone zashita-postgres-production zashita-postgres-pitr \
--point-in-time="2026-02-07T10:00:00Z" \
--project=viktor-integration
|
1.4 Аварийное масштабирование
| # Увеличить CPU/RAM
gcloud sql instances patch zashita-postgres-production \
--tier=db-custom-4-16384 \
--project=viktor-integration
# Увеличить storage
gcloud sql instances patch zashita-postgres-production \
--storage-size=200GB \
--project=viktor-integration
|
1.5 Медленные запросы
| -- Найти медленные запросы
SELECT pid, now() - pg_stat_activity.query_start AS duration, query
FROM pg_stat_activity
WHERE state = 'active' AND (now() - pg_stat_activity.query_start) > interval '5 seconds';
-- Убить запрос
SELECT pg_terminate_backend(pid);
-- Проверить блокировки
SELECT * FROM pg_locks WHERE NOT granted;
|
2. Redis Operations
2.1 Подключение к Redis
| # Получить хост
REDIS_HOST=$(gcloud redis instances describe zashita-redis-production \
--region=us-central1 --format='value(host)')
# Подключение через jump host
ssh -L 6379:$REDIS_HOST:6379 user@jump-host
# Или через Compute Engine VM
gcloud compute ssh redis-bastion --zone=us-central1-a -- \
redis-cli -h $REDIS_HOST
|
2.2 Failover Redis HA
| # Проверить статус
gcloud redis instances describe zashita-redis-production \
--region=us-central1 \
--format="table(name,state,host,currentLocationId)"
# Принудительный failover
gcloud redis instances failover zashita-redis-production \
--region=us-central1 \
--data-protection-mode=limited-data-loss
# После failover обновить DNS/конфиг
|
2.3 Очистка памяти
| redis-cli -h $REDIS_HOST
# Проверить использование памяти
INFO memory
# Найти большие ключи
redis-cli -h $REDIS_HOST --bigkeys
# Очистить конкретный паттерн
redis-cli -h $REDIS_HOST KEYS "session:expired:*" | xargs redis-cli DEL
# Принудительная дефрагментация
CONFIG SET activedefrag yes
|
2.4 Export/Import данных
| # Export в GCS
gcloud redis instances export zashita-redis-production \
gs://zashita-backups-production/redis/export-$(date +%Y%m%d).rdb \
--region=us-central1
# Import из GCS
gcloud redis instances import zashita-redis-production \
gs://zashita-backups-production/redis/export-20260207.rdb \
--region=us-central1
|
3. Pub/Sub Operations
3.1 Мониторинг очередей
| # Проверить backlog
gcloud pubsub subscriptions describe events-worker-subscription \
--format="table(name,ackDeadlineSeconds)" \
--project=viktor-integration
# Метрики
gcloud monitoring metrics list \
--filter="metric.type:pubsub" \
--project=viktor-integration | head -20
# Список unacked сообщений
gcloud pubsub subscriptions pull events-worker-subscription \
--limit=10 --auto-ack=false
|
3.2 Dead Letter Queue обработка
| # Просмотреть DLQ
gcloud pubsub subscriptions pull events-dlq-subscription \
--limit=100 --format=json > dlq_messages.json
# Повторная публикация после фикса
cat dlq_messages.json | jq -r '.[] | .message.data' | \
while read msg; do
echo "$msg" | base64 -d | \
gcloud pubsub topics publish events-topic --message="$(cat)"
done
# Очистить DLQ (после обработки)
gcloud pubsub subscriptions seek events-dlq-subscription \
--time=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
3.3 Масштабирование подписчиков
| # Увеличить параллелизм (в приложении)
# Обновить flowControl в subscriber config:
# maxOutstandingMessages: 1000 -> 5000
# maxOutstandingBytes: 100MB -> 500MB
# Или развернуть больше инстансов
kubectl scale deployment event-processor --replicas=5
|
4. Disaster Recovery
4.1 RTO/RPO цели
| Компонент |
RPO |
RTO |
| Cloud SQL |
5 мин (PITR) |
15 мин |
| Redis |
24 часа |
30 мин |
| Pub/Sub |
0 (persistent) |
5 мин |
| Cloud Storage |
0 |
5 мин |
4.2 DR Playbook - Полный отказ региона
|
#!/bin/bash
# dr-failover.sh - Fail over production traffic to the DR region.
#
# Requires: gcloud authenticated with Cloud SQL / Redis / DNS admin roles.
# Env vars: NEW_IP        - DR load-balancer address to publish in DNS (required)
#           SLACK_WEBHOOK - incoming-webhook URL for the notification (optional)
set -euo pipefail

readonly DR_REGION="us-east1"
readonly DR_PROJECT="viktor-integration"

echo "🚨 Initiating DR Failover to $DR_REGION"

# 1. Promote the Cloud SQL DR instance.
gcloud sql instances failover zashita-postgres-production-dr \
  --project="$DR_PROJECT"

# 2. Look up the DR Redis endpoint (application config must be re-pointed to it).
NEW_REDIS=$(gcloud redis instances describe zashita-redis-dr \
  --region="$DR_REGION" --project="$DR_PROJECT" --format='value(host)')
echo "New Redis: $NEW_REDIS"

# 3. Repoint DNS at the DR front end.
# NOTE(review): the original script passed the literal string "NEW_IP" to
# --rrdatas, which would publish a bogus A record. The real address must be
# supplied via $NEW_IP — TODO: wire in an automatic lookup of the DR LB IP.
NEW_IP="${NEW_IP:?set NEW_IP to the DR load-balancer address before running}"
gcloud dns record-sets update api.zashita.io \
  --zone=zashita-zone \
  --project="$DR_PROJECT" \
  --type=A \
  --rrdatas="$NEW_IP" \
  --ttl=60

# 4. Notify the team. Best-effort: a Slack outage must not mark the (already
# completed) failover as failed, and an unset webhook must not trip `set -u`.
if [[ -n "${SLACK_WEBHOOK:-}" ]]; then
  curl -fsS -X POST "$SLACK_WEBHOOK" \
    -H 'Content-Type: application/json' \
    -d "{\"text\":\"🚨 DR Failover Complete - Region: $DR_REGION\"}" \
    || echo "warn: Slack notification failed" >&2
fi

echo "✅ DR Failover Complete"
|
4.3 DR Test Checklist
5. Incident Response
5.1 Severity Levels
| Level |
Описание |
Response Time |
Example |
| SEV1 |
Полный outage |
15 мин |
Все API недоступны |
| SEV2 |
Частичный outage |
30 мин |
50% ошибок |
| SEV3 |
Деградация |
2 часа |
Медленные запросы |
| SEV4 |
Minor issue |
24 часа |
UI баг |
5.2 SEV1 Response - API Down
| # 1. Проверить статус сервисов
docker ps -a
kubectl get pods -A
# 2. Проверить Cloud SQL
gcloud sql instances describe zashita-postgres-production \
--format="value(state)"
# 3. Проверить логи
gcloud logging read "severity>=ERROR" --limit=50 --freshness=10m
# 4. Попробовать рестарт
docker-compose -f docker-compose.gcp.yml restart z-core-api
# 5. Если не помогает - failover
./scripts/dr-failover.sh
# 6. Коммуникация
# - Обновить status page
# - Уведомить stakeholders
# - Начать postmortem
|
5.3 SEV2 Response - High Error Rate
| # 1. Идентифицировать источник
gcloud logging read \
'resource.type="cloud_run_revision" AND severity>=ERROR' \
--limit=100 --format=json | jq '.[] | .jsonPayload'
# 2. Проверить метрики
gcloud monitoring dashboards list
# 3. Проверить recent deployments
git log --oneline -10
kubectl rollout history deployment/api
# 4. Rollback если нужно
kubectl rollout undo deployment/api
# 5. Scale если нагрузка
kubectl scale deployment/api --replicas=10
|
5.4 Postmortem Template
| # Postmortem: [Title]
**Date:** YYYY-MM-DD
**Duration:** X hours Y minutes
**Severity:** SEVX
**Author:** Name
## Summary
Brief description of the incident.
## Impact
- X users affected
- Y% error rate
- $Z revenue impact
## Timeline
- HH:MM - Alert fired
- HH:MM - Engineer engaged
- HH:MM - Root cause identified
- HH:MM - Fix deployed
- HH:MM - Incident resolved
## Root Cause
Technical explanation of what went wrong.
## Resolution
What was done to fix the issue.
## Lessons Learned
### What went well
-
### What could be improved
-
## Action Items
- [ ] Action 1 - Owner - Due Date
- [ ] Action 2 - Owner - Due Date
|
6. Scaling Procedures
6.1 Вертикальное масштабирование
Cloud SQL
| # Текущий tier
gcloud sql instances describe zashita-postgres-production \
--format="value(settings.tier)"
# Увеличить (требует downtime ~1 мин)
gcloud sql instances patch zashita-postgres-production \
--tier=db-custom-4-16384
# Tiers доступные:
# db-custom-2-8192 = 2 vCPU, 8 GB
# db-custom-4-16384 = 4 vCPU, 16 GB
# db-custom-8-32768 = 8 vCPU, 32 GB
# db-custom-16-65536 = 16 vCPU, 64 GB
|
Redis
| # Текущий размер
gcloud redis instances describe zashita-redis-production \
--region=us-central1 --format="value(memorySizeGb)"
# Увеличить (zero-downtime для HA)
gcloud redis instances update zashita-redis-production \
--region=us-central1 \
--size=4
|
6.2 Горизонтальное масштабирование
Cloud SQL Read Replicas
| # Создать реплику
gcloud sql instances create zashita-postgres-replica \
--master-instance-name=zashita-postgres-production \
--tier=db-custom-2-8192 \
--zone=us-central1-b
# Направить read traffic на реплику (в приложении)
# DATABASE_URL_READ=postgres://...replica...
|
Application Scaling
| # Kubernetes
kubectl scale deployment/z-core-api --replicas=10
# Cloud Run
gcloud run services update z-core-api \
--min-instances=5 \
--max-instances=100
# Docker Compose (local)
docker-compose up -d --scale z-core-api=3
|
6.3 Auto-scaling конфигурация
| # Kubernetes HPA
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: z-core-api-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: z-core-api
minReplicas: 2
maxReplicas: 20
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
|
📞 Контакты
| Role |
Contact |
Escalation |
| On-Call Primary |
PagerDuty |
Automatic |
| On-Call Secondary |
Slack #oncall |
30 min |
| Team Lead |
Direct |
SEV1/SEV2 |
| Management |
Email |
SEV1 |
Последнее обновление: 2026-02-07
Версия: 1.0.0