
Commit 2a35d3e

Adjust the cluster backup/restore scripts and docs

1 parent 45587b7 commit 2a35d3e
9 files changed: +53 -237 lines

23.backup.yml (+19 -34)

```diff
@@ -3,49 +3,34 @@
 
 - hosts:
   - etcd
-  roles:
-  - cluster-backup
+  tasks:
+  - name: Backing up etcd data
+    shell: "mkdir -p /etcd_backup && cd /etcd_backup && \
+           ETCDCTL_API=3 {{ bin_dir }}/etcdctl snapshot save snapshot.db"
+    args:
+      warn: false
+
+  - name: Fetching the etcd backup to the ansible controller
+    fetch:
+      src: /etcd_backup/snapshot.db
+      dest: "{{ base_dir }}/.cluster/backup/"
+      flat: yes
+    run_once: true
 
 - hosts:
-  - deploy
+  - localhost
   tasks:
-  - name: Creating backup dirs
-    file: name={{ item }} state=directory
-    with_items:
-    - "{{ base_dir }}/roles/cluster-backup/files/ca"
-    - "{{ base_dir }}/roles/cluster-backup/files/hosts"
-    - "{{ base_dir }}/roles/cluster-backup/files/snapshot"
-
-  - name: Backing up CA sth
-    copy:
-      src: "{{ ca_dir }}/{{ item }}"
-      dest: "{{ base_dir }}/roles/cluster-backup/files/ca/{{ item }}"
-    with_items:
-    - ca.pem
-    - ca-key.pem
-    - ca.csr
-    - ca-csr.json
-    - ca-config.json
-
   - name: Backing up ansible hosts-1
     copy:
       src: "{{ base_dir }}/hosts"
-      dest: "{{ base_dir }}/roles/cluster-backup/files/hosts/hosts"
+      dest: "{{ base_dir }}/.cluster/backup/hosts"
     register: p
 
   - name: Backing up ansible hosts-2
-    shell: "cd {{ base_dir }}/roles/cluster-backup/files/hosts && \
+    shell: "cd {{ base_dir }}/.cluster/backup && \
           cp -fp hosts hosts-$(date +'%Y%m%d%H%M')"
     when: 'p is changed'
 
-  - name: Backing up etcd snapshot-1
-    copy:
-      src: "{{ base_dir }}/roles/cluster-backup/files/snapshot.db"
-      dest: "{{ base_dir }}/roles/cluster-backup/files/snapshot/snapshot.db"
-    register: q
-
-  - name: Backing up etcd snapshot-2
-    shell: "cd {{ base_dir }}/roles/cluster-backup/files/ && \
-           mv -f snapshot.db snapshot/snapshot-$(date +'%Y%m%d%H%M').db"
-    when: 'q is changed'
-
+  - name: Backing up etcd snapshot with datetime
+    shell: "cd {{ base_dir }}/.cluster/backup && \
+           cp -fp snapshot.db snapshot-$(date +'%Y%m%d%H%M').db"
```
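Not part of the commit, but a quick way to sanity-check a snapshot fetched by this playbook, assuming `etcdctl` is also available on the controller's PATH:

```bash
# prints the snapshot's hash, revision, total keys and size if the file
# is a valid etcd v3 snapshot; a corrupt file fails with an error instead
ETCDCTL_API=3 etcdctl snapshot status /etc/ansible/.cluster/backup/snapshot.db -w table
```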

24.restore.yml (-71)

```diff
@@ -1,77 +1,6 @@
 # cluster-restore playbook
 # read the guide: 'op/cluster_restore.md'
 
-# to restore CA sth on 'deploy' node
-- hosts: deploy
-  tasks:
-  - name: Restoring dirs of CA sth
-    file: name=/etc/kubernetes/ssl/ state=directory
-
-  - name: Restoring CA sth
-    copy:
-      src: "{{ base_dir }}/roles/cluster-backup/files/ca/{{ item }}"
-      dest: "{{ ca_dir }}/{{ item }}"
-    with_items:
-    - ca.pem
-    - ca-key.pem
-    - ca.csr
-    - ca-csr.json
-    - ca-config.json
-
-- hosts: deploy
-  roles:
-  - deploy
-
-# pre-tasks on all nodes
-- hosts:
-  - kube-master
-  - kube-node
-  - etcd
-  roles:
-  - prepare
-
-# [optional] only needed by multi-master cluster
-- hosts: lb
-  roles:
-  - lb
-
-# to install etcd cluster
-- hosts: etcd
-  roles:
-  - etcd
-
-# to install docker
-- hosts:
-  - kube-master
-  - kube-node
-  roles:
-  - docker
-
-# to set up 'kube-master' nodes
-- hosts:
-  - kube-master
-  roles:
-  - kube-master
-  - kube-node
-#
-  tasks:
-  - name: Making master nodes SchedulingDisabled
-    shell: "{{ bin_dir }}/kubectl cordon {{ inventory_hostname }} "
-    when: DEPLOY_MODE != "allinone"
-    ignore_errors: true
-
-  - name: Setting master role name
-    shell: "{{ bin_dir }}/kubectl label node {{ inventory_hostname }} kubernetes.io/role=master --overwrite"
-    ignore_errors: true
-
-# to set up 'kube-node' nodes
-- hosts:
-  - kube-node
-  roles:
-  - kube-node
-
-# to restore data of etcd cluster
 - hosts: etcd
   roles:
   - cluster-restore
-
```
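With the playbook trimmed to just the cluster-restore role, a run is a single command. The `db_to_restore` default can also be overridden per run via standard ansible extra-vars; the dated filename below is illustrative, taken from the backup listing in the docs:

```bash
# restore the most recent backup (the role's default)
ansible-playbook /etc/ansible/24.restore.yml

# or pick a dated snapshot from /etc/ansible/.cluster/backup without editing defaults
ansible-playbook /etc/ansible/24.restore.yml -e db_to_restore=snapshot-201907030954.db
```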

docs/op/cluster_restore.md (+20 -103)

````diff
@@ -3,127 +3,44 @@
 Although a K8s cluster can be deployed in a highly available, multi-master, multi-node setup, it is still worth understanding its backup and disaster-recovery story. In an HA k8s cluster the etcd cluster holds the entire cluster state, so backup and restore come down to:
 
 - backing up data from the running etcd cluster to files on disk
-- restoring data from an etcd backup file into the running etcd cluster, then rebuilding the whole cluster from it
+- restoring data from an etcd backup file, returning the cluster to its state at backup time
 
-## Prerequisites
+## Backup and restore instructions
 
-A k8s cluster may hit problems it cannot heal from on its own, caused by hardware/software faults or operator error; then the cluster has to be restored and rebuilt from backups. For a cluster created with kubeasz, a restore requires:
-
-- an etcd backup file (etcd V3 data) taken while the cluster was healthy
-- the CA certificate files used when the cluster was created
-- the ansible hosts file used when the cluster was created
-
-## Manual backup and restore instructions
-
-First build a test cluster with kubeasz, deploy a few test deployments, and once everything checks out, take a backup:
-
-- 1. Run the data backup on one etcd node and copy the resulting `snapshot.db` to all etcd cluster nodes
+- 1. First build a test cluster, deploy a few test deployments, and once everything checks out, take a backup:
 
 ``` bash
-$ mkdir -p /backup/k8s/ && cd /backup/k8s
-$ ETCDCTL_API=3 etcdctl snapshot save snapshot.db
+$ ansible-playbook /etc/ansible/23.backup.yml
 ```
 
-- 2. On the deploy node, back up the CA certificate files
+When it finishes, inspect the backups under the backup directory, for example:
 
-``` bash
-$ mkdir -p /backup/k8s/ && cp /etc/kubernetes/ssl/ca* /backup/k8s/
 ```
-
-- 3. On the deploy node, wipe the cluster to simulate a total crash
-
-``` bash
-$ ansible-playbook /etc/ansible/99.clean.yml
+/etc/ansible/.cluster/backup/
+├── hosts
+├── hosts-201907030954
+├── snapshot-201907030954.db
+├── snapshot-201907031048.db
+└── snapshot.db
 ```
 
-- 4. On the deploy node, rebuild the cluster step by step
+- 2. Simulate an accidental deletion (omitted)
 
-``` bash
-# restore the original cluster's CA certificates
-$ mkdir -p /etc/kubernetes/ssl/ && cp /backup/k8s/* /etc/kubernetes/ssl/
+- 3. Restore the cluster and verify
 
-# then run the cluster install steps, up through the kube-node stage
-$ cd /etc/ansible
-$ ansible-playbook 01.prepare.yml
-$ ansible-playbook 02.etcd.yml
-$ ansible-playbook 03.docker.yml
-$ ansible-playbook 04.kube-master.yml
-$ ansible-playbook 05.kube-node.yml
-
-# once the above checks out, stop the etcd services and wipe the new etcd data directories
-$ ansible etcd -m service -a 'name=etcd state=stopped'
-$ asnible etcd -m file -a 'name=/var/lib/etcd/member/ state=absent'
-```
-
-- 5. Log in to each etcd node and restore the data by hand; every etcd node needs the following
-
-``` bash
-# consult this node's /etc/systemd/system/etcd.service, substitute the {{ }} variables below, then run
-$ cd /backup/k8s/
-$ ETCDCTL_API=3 etcdctl snapshot restore snapshot.db \
-    --name {{ NODE_NAME }} \
-    --initial-cluster {{ ETCD_NODES }} \
-    --initial-cluster-token etcd-cluster-0 \
-    --initial-advertise-peer-urls https://{{ inventory_hostname }}:2380
-
-# this creates a {{ NODE_NAME }}.etcd folder; copy its member subfolder into the etcd data directory
-$ cp -r {{ NODE_NAME }}.etcd/member /var/lib/etcd/
-
-$ systemctl restart etcd
-```
-
-- 6. On the deploy node, rebuild the network
+The etcd backup to restore can be set in `roles/cluster-restore/defaults/main.yml` (pick one from the backup directory above); the most recent backup is used by default. After the restore, allow some time for pods, services, and other resources to be recreated.
 
 ``` bash
-$ ansible-playbook /etc/ansible/tools/change_k8s_network.yml
-```
-
-Afterwards, verify that the whole cluster is back to normal and that the earlier test deployments were all restored.
-
-- Reference: https://github.com/coreos/etcd/blob/master/Documentation/op-guide/recovery.md
-
-## Automated backup/restore guide
-
-- 1. Cluster backup
-
-``` bash
-$ ansible-playbook /etc/ansible/23.backup.yml
-```
-
-When it finishes, inspect the backups under `/etc/ansible/roles/cluster-backup/files`, for example:
-
-``` bash
-roles/cluster-backup/files/
-├── ca                      # cluster CA backups
-│   ├── ca-config.json
-│   ├── ca.csr
-│   ├── ca-csr.json
-│   ├── ca-key.pem
-│   └── ca.pem
-├── hosts                   # ansible hosts backups
-│   ├── hosts               # most recent backup
-│   └── hosts-201807231642
-├── readme.md
-└── snapshot                # etcd data backups
-    ├── snapshot-201807231642.db
-    └── snapshot.db         # most recent backup
+$ ansible-playbook /etc/ansible/24.restore.yml
 ```
-
-- 2. Simulate a cluster failure
+If major cluster components (master/etcd/node) run into unrecoverable problems, try the following sequence: [clean]() --> [create]() --> [restore]()
 
 ``` bash
 $ ansible-playbook /etc/ansible/99.clean.yml
-```
-
-**Note**: to simulate a total crash, the whole cluster is wiped here; in practice, given a backup, it is also advisable to wipe the cluster completely before attempting a restore
-
-- 3. Cluster restore
-
-The etcd backup to restore can be set in `roles/cluster-restore/defaults/main.yml`; the most recent backup is used by default
-
-``` bash
+$ ansible-playbook /etc/ansible/90.setup.yml
 $ ansible-playbook /etc/ansible/24.restore.yml
-$ ansible-playbook /etc/ansible/tools/change_k8s_network.yml
 ```
 
-Once done, verify that the whole cluster is restored to its original state!
+## Reference
+
+- https://github.com/coreos/etcd/blob/master/Documentation/op-guide/recovery.md
````
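The revised doc leaves step 3's verification to the reader; a minimal check on the controller after the restore playbook finishes might look like this (plain kubectl, nothing kubeasz-specific):

```bash
# nodes should return to Ready, and previously created workloads should
# reappear once etcd state is served again (this can take a few minutes)
kubectl get node
kubectl get pod --all-namespaces
```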

roles/cluster-backup/files/readme.md (-3)

This file was deleted.

roles/cluster-backup/tasks/main.yml (-14)

This file was deleted.

roles/cluster-restore/defaults/main.yml (+1)

```diff
@@ -1,4 +1,5 @@
 # specify the etcd backup to restore; the most recent backup is used by default
+# list the backup directory on the ansible controller: /etc/ansible/.cluster/backup
 db_to_restore: "snapshot.db"
 
 # IPs and ports for etcd cluster peer communication, generated automatically from the etcd group members
```
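To pin a specific backup instead of the default `snapshot.db`, point `db_to_restore` at one of the dated files; a minimal sketch on the controller (the timestamp is taken from the docs example above):

```bash
# list available backups, then pin one by editing the role default
ls /etc/ansible/.cluster/backup/
sed -i 's/^db_to_restore:.*/db_to_restore: "snapshot-201907030954.db"/' \
    /etc/ansible/roles/cluster-restore/defaults/main.yml
```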

roles/cluster-restore/tasks/main.yml (+7 -7)

```diff
@@ -5,26 +5,26 @@
   file: name=/var/lib/etcd/member state=absent
 
 - name: Creating the backup directory
-  file: name=/backup/k8s state=directory
+  file: name=/etcd_backup state=directory
 
 - name: Staging the specified etcd backup
   copy:
-    src: "{{ base_dir }}/roles/cluster-backup/files/snapshot/{{ db_to_restore }}"
-    dest: "/backup/k8s/snapshot.db"
+    src: "{{ base_dir }}/.cluster/backup/{{ db_to_restore }}"
+    dest: "/etcd_backup/snapshot.db"
 
-- name: Cleaning up previously restored data
-  file: name=/backup/k8s/{{ NODE_NAME }}.etcd state=absent
+- name: Cleaning up data from the last restore
+  file: name=/etcd_backup/{{ NODE_NAME }}.etcd state=absent
 
 - name: Restoring the etcd data
-  shell: "cd /backup/k8s && \
+  shell: "cd /etcd_backup && \
          ETCDCTL_API=3 {{ bin_dir }}/etcdctl snapshot restore snapshot.db \
          --name {{ NODE_NAME }} \
          --initial-cluster {{ ETCD_NODES }} \
          --initial-cluster-token etcd-cluster-0 \
          --initial-advertise-peer-urls https://{{ inventory_hostname }}:2380"
 
 - name: Copying restored data into the etcd data directory
-  shell: "cp -rf /backup/k8s/{{ NODE_NAME }}.etcd/member /var/lib/etcd/"
+  shell: "cp -rf /etcd_backup/{{ NODE_NAME }}.etcd/member /var/lib/etcd/"
 
 - name: Restarting the etcd service
   service: name=etcd state=restarted
```
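For clarity, roughly what the templated restore task expands to on one member of a hypothetical 3-node etcd cluster; the node names, IPs, and `/opt/kube/bin` path are placeholders, real values come from `NODE_NAME` / `ETCD_NODES` / `bin_dir` in the inventory:

```bash
# run on the member named etcd1; each member restores with its own --name
# and peer URL, while --initial-cluster lists all members identically
cd /etcd_backup && \
ETCDCTL_API=3 /opt/kube/bin/etcdctl snapshot restore snapshot.db \
    --name etcd1 \
    --initial-cluster etcd1=https://192.168.1.1:2380,etcd2=https://192.168.1.2:2380,etcd3=https://192.168.1.3:2380 \
    --initial-cluster-token etcd-cluster-0 \
    --initial-advertise-peer-urls https://192.168.1.1:2380
```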

roles/deploy/tasks/main.yml (+1)

```diff
@@ -2,6 +2,7 @@
   file: name={{ item }} state=directory
   with_items:
   - "{{ base_dir }}/.cluster/ssl"
+  - "{{ base_dir }}/.cluster/backup"
 
 - name: Setting permissions on the local bin directory
   file: path={{ base_dir }}/bin state=directory mode=0755 recurse=yes
```

tools/easzctl (+5 -5)

```diff
@@ -307,17 +307,17 @@ function list() {
     [ -f "$BASEPATH/.cluster/current_cluster" ] || { echo "[ERROR] invalid context, run 'easzctl checkout <cluster_name>' first"; return 1; }
     CLUSTER=$(cat $BASEPATH/.cluster/current_cluster)
     echo -e "\nlist of managed contexts (current: \033[33m$CLUSTER\033[0m)"
-    i=1; for Cluster in $(ls $BASEPATH/.cluster/ |grep -Ev "ssl|current_cluster|kubeconfig");
+    i=1; for c in $(ls $BASEPATH/.cluster/ |grep -Ev "backup|ssl|current_cluster|kubeconfig");
     do
-        echo -e "==> context $i:\t$Cluster"
+        echo -e "==> context $i:\t$c"
         let "i++"
     done
     echo -e "\nlist of installed clusters (current: \033[33m$CLUSTER\033[0m)"
-    i=1; for Cluster in $(ls $BASEPATH/.cluster/ |grep -Ev "ssl|current_cluster|kubeconfig");
+    i=1; for c in $(ls $BASEPATH/.cluster/ |grep -Ev "backup|ssl|current_cluster|kubeconfig");
     do
-        KUBECONF=$BASEPATH/.cluster/$Cluster/config
+        KUBECONF=$BASEPATH/.cluster/$c/config
         if [ -f "$KUBECONF" ]; then
-            echo -e "==> cluster $i:\t$Cluster"
+            echo -e "==> cluster $i:\t$c"
             $BASEPATH/bin/kubectl --kubeconfig=$KUBECONF get node
         fi
         let "i++"
```
