forked from PaddlePaddle/Paddle
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathps_instance.py
More file actions
160 lines (138 loc) · 4.38 KB
/
ps_instance.py
File metadata and controls
160 lines (138 loc) · 4.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .helper import MPIHelper
class PaddlePSInstance(object):
    """
    Assign this MPI process the role of parameter server, worker or idle.

    The role is derived from the process rank reported by ``MPIHelper`` and
    from the requested layout mode:

    * mode 0: ranks ``[0, server_num)`` are workers and ranks
      ``[server_num, total_server_worker)`` are servers;
    * mode 1: within each group of ``proc_per_node`` consecutive ranks,
      even local ranks are servers and odd local ranks are workers.

    Ranks at or beyond ``total_server_worker`` and any other mode value
    leave the process idle.

    Args:
        server_worker_mode: layout mode, 0 or 1, default is 1
        proc_per_node: process per node, default is 2

    Examples:
        instance = PaddlePSInstance(1, 2)
    """

    # Values stored in ``self._node_type``.
    _IDLE = -1
    _SERVER = 0
    _WORKER = 1

    def __init__(self, server_worker_mode=1, proc_per_node=2):
        self.dh = MPIHelper()
        self._rankid = self.dh.get_rank()
        self._server_worker_mode = server_worker_mode
        self._proc_per_node = proc_per_node
        self._nodes = self.dh.get_size()
        self._ip = 0
        # Half of the processes are workers and half are servers. Floor
        # division keeps the counts integral on Python 3 as well (plain "/"
        # would turn them, and everything compared against them, into floats).
        self._worker_num = self._nodes * self._proc_per_node // 2
        self._server_num = self._nodes * self._proc_per_node // 2
        self._total_server_worker = self._worker_num + self._server_num
        self._node_type = None  # set to _IDLE, _WORKER or _SERVER below
        self._set_nodetype()
        self._comm = None
        self._split_comm()

    def _set_nodetype(self):
        """Derive ``self._node_type`` from the rank and the layout mode."""
        rank = self._rankid
        node_type = self._IDLE
        if self._server_worker_mode == 0:
            if rank < self._server_num:
                node_type = self._WORKER
            elif rank < self._total_server_worker:
                node_type = self._SERVER
        elif self._server_worker_mode == 1:
            if rank < self._total_server_worker:
                # Parity of the rank's position inside its node group.
                if rank % self._proc_per_node % 2 == 0:
                    node_type = self._SERVER
                else:
                    node_type = self._WORKER
        self._node_type = node_type

    def _split_comm(self):
        """Create a sub-communicator shared by processes of the same role."""
        # NOTE(review): idle processes do not call Split. If Split is a
        # collective call on the underlying communicator, confirm that idle
        # ranks cannot occur or that skipping them is intended.
        if self.is_server() or self.is_worker():
            self._comm = self.dh.comm.Split(self._node_type)

    def get_worker_index(self):
        """
        Return worker index

        The value is only meaningful when this instance is a worker.
        """
        if self._server_worker_mode == 0:
            # In mode 0 the workers occupy ranks [0, server_num), so the rank
            # is already the zero-based worker index.
            return self._rankid
        return self._rankid // self._proc_per_node

    def get_server_index(self):
        """
        Return server index

        The value is only meaningful when this instance is a server.
        """
        if self._server_worker_mode == 0:
            # In mode 0 the servers start at rank server_num (the boundary
            # used by _set_nodetype), so shift to a zero-based index.
            return self._rankid - self._server_num
        return self._rankid // self._proc_per_node

    def is_worker(self):
        """
        Return instance is worker or not
        """
        return self._node_type == self._WORKER

    def is_server(self):
        """
        Return instance is server or not
        """
        return self._node_type == self._SERVER

    def is_first_worker(self):
        """
        Return instance is first worker or not
        """
        return self.is_worker() and 0 == self.get_worker_index()

    def set_ip(self, ip):
        """
        set server ip
        """
        self._ip = ip

    def gather_ips(self):
        """
        Return all servers and workers ip through mpi allgather
        """
        self._ips = self.dh.comm.allgather(self._ip)
        return self._ips

    def get_node_cnt(self):
        """
        Return node cnt
        """
        return self._nodes

    def get_worker_num(self):
        """
        Return worker num
        """
        return self._worker_num

    def get_server_num(self):
        """
        Return server num
        """
        return self._server_num

    def barrier_all(self):
        """
        barrier workers and servers
        """
        self.dh.comm.barrier()

    def barrier_worker(self):
        """
        barrier workers
        """
        if self.is_worker():
            self._comm.barrier()

    def finalize(self):
        """
        MPI finalize
        """
        self.dh.finalize()
if __name__ == "__main__":
    # Smoke test: the constructor takes (server_worker_mode, proc_per_node),
    # so build the instance with the values from the class docstring example
    # and synchronise all processes once.
    instance = PaddlePSInstance(1, 2)
    instance.barrier_all()