I am trying to train a neural net with PyTorch Lightning on Ray on a Databricks cluster. As a starting point, I copied the example from https://docs.ray.io/en/latest/train/getting-started-pytorch-lightning.html. Unfortunately, training stops after one epoch with a RayTaskError.
Is something wrong with the logging location? Might it be a Ray bug, or a compatibility problem between Python, Ray and Lightning? I am a little lost here, as I don't understand the error message. Has anyone encountered anything similar?
I set up the Ray cluster as follows:
from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster

eps = 1
ray_worker_memory = 410

remote_conn_str = setup_ray_cluster(
    max_worker_nodes=8,
    min_worker_nodes=1,
    memory_worker_node=ray_worker_memory * 0.7 * 10**9 - eps,
    object_store_memory_worker_node=ray_worker_memory * 0.3 * 10**9 - eps,
    collect_log_to_path="/dbfs/test/ray_collected_logs",
)
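For completeness: before training, the driver has to be attached to this cluster. As far as I understand the Ray-on-Spark docs, setup_ray_cluster sets RAY_ADDRESS, so a plain ray.init() (or the implicit one triggered later by trainer.fit()) should connect to it, roughly:

import ray

# Attach the driver/notebook to the Ray-on-Spark cluster started above.
# Without an explicit address, ray.init() should pick up RAY_ADDRESS set by setup_ray_cluster.
ray.init()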
I took the code from the tutorial mentioned above:
import os
import tempfile
import torch
from torch.utils.data import DataLoader
from torchvision.models import resnet18
from torchvision.datasets import FashionMNIST
from torchvision.transforms import ToTensor, Normalize, Compose
import lightning.pytorch as pl
import ray.train.lightning
from ray.train.torch import TorchTrainer
# Model, Loss, Optimizer
class ImageClassifier(pl.LightningModule):
    def __init__(self):
        super(ImageClassifier, self).__init__()
        self.model = resnet18(num_classes=10)
        self.model.conv1 = torch.nn.Conv2d(
            1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
        )
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        outputs = self.forward(x)
        loss = self.criterion(outputs, y)
        self.log("loss", loss, on_step=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=0.001)
def train_func():
    # Data
    transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])
    data_dir = os.path.join(tempfile.gettempdir(), "data")
    train_data = FashionMNIST(root=data_dir, train=True, download=True, transform=transform)
    train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)

    # Training
    model = ImageClassifier()
    # [1] Configure PyTorch Lightning Trainer.
    trainer = pl.Trainer(
        max_epochs=10,
        devices="auto",
        accelerator="auto",
        strategy=ray.train.lightning.RayDDPStrategy(),
        plugins=[ray.train.lightning.RayLightningEnvironment()],
        callbacks=[ray.train.lightning.RayTrainReportCallback()],
        # [1a] Optionally, disable the default checkpointing behavior
        # in favor of the `RayTrainReportCallback` above.
        enable_checkpointing=False,
    )
    trainer = ray.train.lightning.prepare_trainer(trainer)
    trainer.fit(model, train_dataloaders=train_dataloader)
# [2] Configure scaling and resource requirements.
scaling_config = ray.train.ScalingConfig(num_workers=2, use_gpu=False)
# [3] Launch distributed training job.
trainer = TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    run_config=ray.train.RunConfig(
        storage_path="/dbfs/test/ray_collected_logs",
        name="lightning",
    ),
)
result: ray.train.Result = trainer.fit()
# [4] Load the trained model.
with result.checkpoint.as_directory() as checkpoint_dir:
    model = ImageClassifier.load_from_checkpoint(
        os.path.join(
            checkpoint_dir,
            ray.train.lightning.RayTrainReportCallback.CHECKPOINT_NAME,
        ),
    )
After one epoch, I got the error:
RayTaskError(OSError): ray::_Inner.train() (pid=8556, ip=10.139.64.13, actor_id=84c7d46bc785ca0be4760e0d02000000, repr=TorchTrainer)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/tune/trainable/trainable.py", line 331, in train
raise skipped from exception_cause(skipped)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/train/_internal/utils.py", line 53, in check_for_failure
ray.get(object_ref)
^^^^^^^^^^^^^^^^^^^
ray.exceptions.RayTaskError(OSError): ray::_RayTrainWorker__execute.get_next() (pid=8728, ip=10.139.64.13, actor_id=cb307feab981f85ea2fea9bd02000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x7f63f35fb890>)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute
raise skipped from exception_cause(skipped)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/train/_internal/utils.py", line 169, in discard_return_wrapper
train_func(*args, **kwargs)
File "/root/.ipykernel/3049/command-3520572809263628-1931200599", line 60, in train_func
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 543, in fit
call._call_and_handle_interrupt(
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 105, in launch
return function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 579, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 986, in _run
results = self._run_stage()
^^^^^^^^^^^^^^^^^
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 1030, in _run_stage
self.fit_loop.run()
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py", line 206, in run
self.on_advance_end()
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py", line 376, in on_advance_end
call._call_callback_hooks(trainer, "on_train_epoch_end", monitoring_callbacks=False)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py", line 210, in _call_callback_hooks
fn(trainer, trainer.lightning_module, *args, **kwargs)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/train/lightning/_lightning_utils.py", line 289, in on_train_epoch_end
train.report(metrics=metrics, checkpoint=checkpoint)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/train/_internal/session.py", line 657, in wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/train/_internal/session.py", line 748, in report
_get_session().report(metrics, checkpoint=checkpoint)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/train/_internal/session.py", line 426, in report
persisted_checkpoint = self.storage.persist_current_checkpoint(checkpoint)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/train/_internal/storage.py", line 542, in persist_current_checkpoint
_pyarrow_fs_copy_files(
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-94c29ff9-2f5f-4216-b6c2-096d629bb4ad/lib/python3.11/site-packages/ray/train/_internal/storage.py", line 110, in _pyarrow_fs_copy_files
return pyarrow.fs.copy_files(
^^^^^^^^^^^^^^^^^^^^^^
File "/databricks/python/lib/python3.11/site-packages/pyarrow/fs.py", line 269, in copy_files
_copy_files_selector(source_fs, source_sel,
File "pyarrow/_fs.pyx", line 1627, in pyarrow._fs._copy_files_selector
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
OSError: [Errno 22] Error writing bytes to file. Detail: [errno 22] Invalid argument
The versions of the (probably crucial) packages are: pytorch-lightning 2.3.3 and ray 2.32.0.
I am running the code on a Databricks cluster with Databricks Runtime Version 15.2 ML (includes Apache Spark 3.5.0, Scala 2.12); the Python version in this DBR is 3.11.0rc1. Using an older DBR (14.3 ML), which comes with Python 3.10.12, leads to the same error.
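For reference, the version numbers above can be read directly in the notebook with something like:

import ray
import lightning

# Print the installed versions of the two packages in question.
print("ray:", ray.__version__)
print("lightning:", lightning.__version__)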
To check whether the path is a valid, writable directory, I executed this:
dbutils.fs.mkdirs("/dbfs/test/ray_collected_logs/test_dir")
dbutils.fs.put("/dbfs/test/ray_collected_logs/test_file.txt", "This is a test.")
Both calls worked, but the problem above remains unsolved.
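If it helps anyone to narrow this down: judging from the traceback, the failure happens when ray.train copies the local checkpoint directory to the storage_path via pyarrow.fs.copy_files. I suppose that call could be reproduced in isolation with something like the following untested sketch (the source directory and file name are just placeholders; only the DBFS destination matches my RunConfig):

import os
import tempfile

import pyarrow.fs

# Create a small local directory with one dummy file, standing in for a checkpoint.
src_dir = tempfile.mkdtemp()
with open(os.path.join(src_dir, "dummy_checkpoint.bin"), "wb") as f:
    f.write(os.urandom(1024 * 1024))  # 1 MiB of random bytes

# Mimic what ray.train's storage layer does: copy the directory to the DBFS path.
pyarrow.fs.copy_files(src_dir, "/dbfs/test/ray_collected_logs/copy_test")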