feat: add support for train on windows#37
feat: add support for train on windows #37 — Wang-zipeng wants to merge 5 commits into FateScript:master from Wang-zipeng:for_train_on_windows
Conversation
dl_lib/engine/defaults.py
Outdated
| # Therefore we use a deterministic way to obtain port, | ||
| # so that users are aware of orphan processes by seeing the port occupied. | ||
| port = 2 ** 15 + 2 ** 14 + hash(os.getuid()) % 2 ** 14 | ||
| port = 2 ** 15 + 2 ** 14 + hash("User_name") % 2 ** 14 |
There was a problem hiding this comment.
hash("User_name") is a fixed value; please don't do that.
There was a problem hiding this comment.
It's a fixed value, I know, but I think it is impossible to train on an 8-GPU Windows machine. I will find a way to get the uid on Windows.
| cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||
|
|
||
| dim3 grid(std::min(at::cuda::ATenCeilDiv(output_size, 512L), 4096L)); | ||
| dim3 grid(std::min(ceil_div((int)output_size, 512), 4096)); |
There was a problem hiding this comment.
at::cuda::ATenCeilDiv works on all platforms; the real reason it does not work on Windows is the 'L'.
There was a problem hiding this comment.
I will change it and try to recompile.
There was a problem hiding this comment.
If I remove the "L", will this function still run correctly on Linux? Could I simply remove the "L"?
| cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||
|
|
||
| dim3 grid(std::min(at::cuda::ATenCeilDiv(grad.numel(), 512L), 4096L)); | ||
| dim3 grid(std::min(ceil_div((int)grad.numel(), 512), 4096)); |
| OPTIMIZER=dict( | ||
| NAME="SGD", | ||
| BASE_LR=0.02, | ||
| BASE_LR=0.002, |
There was a problem hiding this comment.
please do not change this, thanks.
There was a problem hiding this comment.
0.02 is too big for one GPU; I will change it back.
| @@ -0,0 +1,126 @@ | |||
| # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved | |||
There was a problem hiding this comment.
This file duplicates tools/train_net.py; you should consider combining them.
There was a problem hiding this comment.
OK, I will try to use the same training approach as on Linux.
I just searched for the meaning of PTAL on Google.
dl_lib/engine/defaults.py
Outdated
| # Therefore we use a deterministic way to obtain port, | ||
| # so that users are aware of orphan processes by seeing the port occupied. | ||
| port = 2 ** 15 + 2 ** 14 + hash(os.getuid()) % 2 ** 14 | ||
| port = 2 ** 15 + 2 ** 14 + hash(getuser()) % 2 ** 14 |
There was a problem hiding this comment.
| port = 2 ** 15 + 2 ** 14 + hash(getuser()) % 2 ** 14 | |
| port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 |
| cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||
|
|
||
| dim3 grid(std::min(at::cuda::ATenCeilDiv(output_size, 512L), 4096L)); | ||
| dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast<int64_t>(output_size), static_cast<int64_t>(512)), static_cast<int64_t>(4096))); |
There was a problem hiding this comment.
It's better to break this long line of code.
| cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||
|
|
||
| dim3 grid(std::min(at::cuda::ATenCeilDiv(grad.numel(), 512L), 4096L)); | ||
| dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)), static_cast<int64_t>(4096))); |
setup.py
Outdated
| "-D__CUDA_NO_HALF_CONVERSIONS__", | ||
| "-D__CUDA_NO_HALF2_OPERATORS__", | ||
| ] | ||
| if "Windows" == os_name: |
There was a problem hiding this comment.
Is sys.platform suitable for your case?
tools/train_net.py
Outdated
| if eval_space_Gb > free_space_Gb: | ||
| logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) " | ||
| f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}") | ||
| if "Linux" == platform.system(): |
There was a problem hiding this comment.
| if "Linux" == platform.system(): | |
| if sys.platform == "linux": |
FateScript
left a comment
There was a problem hiding this comment.
Remember that Python is not C++; code like
`if a = 1` is invalid.
Implement training on Windows.
Compile steps (requires Visual Studio 2017):
Execute "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" in the Windows cmd to set up a build environment.
Enter the code folder and run the command "python setup.py develop".
Training steps:
Others: