"""Module `data/data_module.py`."""
import gc
import os
import pickle
import sys
import tracemalloc
from glob import glob
from pathlib import Path
from typing import Dict, Literal, Set

import h5py
import numpy as np
import omegaconf
import pytorch_lightning as pl
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from src.common.utils import safe_decode_array
from src.data.dataset.dataset_core import H5adSentenceDataset
from src.data.metadata_cache import GlobalH5MetadataCache
from src.data.sampler import CellSetBatchSampler, DistributedCellSetFixPairingBatchSampler
from src.data.split_strategy import split_cellxgene, split_pbmc, split_tahoe100m
from src.data.data_module.data_module_setup import (
    build_new_all_dict,
    build_new_pert_dict,
    perturbation_pretraining_setup,
    perturbation_pretraining_setup_dataset,
    pretraining_setup,
    pretraining_setup_dataset,
)


def build_train_dataloader(dm):
    """
    Build the training dataloader for a datamodule instance.

    :param dm: Datamodule instance.
    :return: Training dataloader.
    """
    if not hasattr(dm, "train_dataset"):
        dm.setup_dataset()
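    # Choose a batch sampler: the distributed fixed-pairing sampler when
    # running under torch.distributed with `use_fixed_pairing` enabled,
    # otherwise the plain cell-set batch sampler.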
    if dist.is_initialized() and dm.data_args.use_fixed_pairing:
        sampler = DistributedCellSetFixPairingBatchSampler(
            dm.train_dataset,
            shuffle=True,
            drop_last=True,
            seed=dm.seed,
        )
    else:
        sampler = CellSetBatchSampler(
            dm.train_dataset,
            shuffle=True,
            seed=dm.seed,
        )
    loader = DataLoader(
        dm.train_dataset,
        batch_size=dm.micro_batch_size,
        collate_fn=dm.train_dataset.collate_fn,
        sampler=sampler,
        num_workers=dm.data_args.num_workers,
        persistent_workers=dm.data_args.persistent_workers,
        pin_memory=dm.data_args.pin_memory,
        drop_last=True,
        prefetch_factor=dm.data_args.prefetch_factor,
    )
    dm.py_logger.info(f"Finished loading training data: {len(dm.train_dataset)} samples")
    return loader


def build_val_dataloaders(dm):
    """
    Build validation/test dataloaders for all splits.

    :param dm: Datamodule instance.
    :return: List of dataloaders in split order.
    """
    loaders = []
    for split in dm.all_split_names:
        assert hasattr(dm, f"{split}_dataset")
        cur_dataset = getattr(dm, f"{split}_dataset")
        collate_fn = cur_dataset.collate_fn
        if dist.is_initialized() and dm.data_args.use_fixed_pairing:
            sampler = DistributedCellSetFixPairingBatchSampler(
                cur_dataset,
                shuffle=False,
                drop_last=True,
                seed=dm.seed,
            )
        else:
            sampler = CellSetBatchSampler(
                cur_dataset,
                shuffle=False,
                seed=dm.seed,
            )
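        # Evaluation iterates in a fixed order (shuffle=False) and keeps the
        # final partial batch at the DataLoader level (drop_last=False).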
        loader = DataLoader(
            cur_dataset,
            batch_size=dm.micro_batch_size,
            collate_fn=collate_fn,
            sampler=sampler,
            num_workers=dm.data_args.num_workers,
            persistent_workers=dm.data_args.persistent_workers,
            pin_memory=dm.data_args.pin_memory,
            drop_last=False,
            prefetch_factor=dm.data_args.prefetch_factor,
        )
        dm.py_logger.info(f"Finished loading {split} data: {len(cur_dataset)} samples")
        loaders.append(loader)
    return loaders


class PretrainingDataModule(pl.LightningDataModule):
    """Pretraining data module used by the PerturbDiff pipeline.

    Instances are expected to carry ``seed``, ``micro_batch_size``,
    ``data_args``, ``py_logger``, and ``all_split_names`` (set by subclass
    constructors such as ``PBMCPerturbationDataModule.__init__``).
    """

    # A previous constructor set these attributes directly:
    #
    #     def __init__(self, seed, micro_batch_size, data_args, py_logger):
    #         super().__init__()
    #         self.seed = seed
    #         self.micro_batch_size = micro_batch_size
    #         self.data_args = data_args
    #         self.py_logger = py_logger
    #         self.all_split_names = [
    #             "holdout_celltype", "holdout_pert",
    #             "random_RNAseq", "random_Perturbseq",
    #         ]
    def train_dataloader(self):
        """Build the training dataloader; this is run every epoch."""
        return build_train_dataloader(self)

    def val_dataloader(self):
        """Build one validation dataloader per split in ``all_split_names``."""
        return build_val_dataloaders(self)

    # @profile
    def setup_dataset(self, stage=None):
        """Construct the pretraining datasets (delegates to ``pretraining_setup_dataset``)."""
        return pretraining_setup_dataset(self, stage=stage)

    # @profile
    def setup(self, stage=None):
        """Run the pretraining setup (delegates to ``pretraining_setup``)."""
        return pretraining_setup(self, stage=stage)


class PBMCPerturbationDataModule(PretrainingDataModule):
    """
    Inherits from PretrainingDataModule for:
    - setup()
    - setup_dataset()
    - train_dataloader()
    - val_dataloader()
    """

    def __init__(self,
                 seed: int,
                 micro_batch_size: int,
                 data_args,
                 py_logger):
        # Skip PretrainingDataModule in the MRO and initialize
        # pl.LightningDataModule directly.
        super(PretrainingDataModule, self).__init__()
        self.seed = seed
        self.micro_batch_size = micro_batch_size
        self.data_args = data_args
        self.py_logger = py_logger
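        # Validation dataloaders are produced for these splits, in this order.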
        self.all_split_names = ["validation", "test"]
    def get_dataset_names_and_paths(self, cfg, data_name):
        """
        Resolve dataset names and file paths from the config.

        :param cfg: Runtime configuration object.
        :param data_name: Dataset collection name (unused in this implementation).
        :return: Five parallel lists: dataset names, h5ad file paths, data
            types, key-info dicts, and selected-gene lists.
        """
        key_info = {
            "pert_col": cfg.pert_col,
            "rnaseq_batch_col": cfg.rnaseq_batch_col,
            "perturbseq_batch_col": cfg.perturbseq_batch_col,
            "cell_type_key": cfg.cell_type_key,
            "cell_line_key": cfg.cell_line_key,
            "control_pert": cfg.control_pert,
            "holdout_celltype": cfg.holdout_celltype,
            "holdout_batches": cfg.holdout_batches,
            "holdout_pert": cfg.holdout_pert,
        }
        with open(cfg.selected_gene_file, "rb") as fin:
            selected_genes = pickle.load(fin)
        # The pickled object may be a set (merged from multiple datasets);
        # sort it so the gene order is deterministic.
        if isinstance(selected_genes, set):
            selected_genes = sorted(list(selected_genes))
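        # Two modes: with no explicit dataset name, treat `dataset_path` as a
        # directory of h5ad shards and infer the modality from the path;
        # otherwise use the single configured file as Perturb-seq data.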
        if cfg.dataset_name is None:
            file_list = glob(os.path.join(cfg.dataset_path, "*.h5ad"))
            file_list = sorted(file_list)[::-1]  # deterministic, reverse-sorted shard order
            if "cellxgene" in cfg.dataset_path.lower():
                data_type = "RNA-seq"
                assert len(file_list) == 23
            elif "tahoe100m" in cfg.dataset_path.lower():
                data_type = "Perturb-seq"
                assert len(file_list) == 14  # total 14 files for tahoe100m
            else:
                raise NotImplementedError
            dataset_name_list = [os.path.splitext(os.path.basename(x))[0] for x in file_list]
            return dataset_name_list, file_list, [data_type] * len(file_list), [key_info] * len(file_list), [selected_genes] * len(file_list)
        else:
            return [cfg.dataset_name], [cfg.dataset_path], ["Perturb-seq"], [key_info], [selected_genes]
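
    # Illustrative config sketch for the directory mode above (the paths and
    # values are assumptions for illustration, not pipeline defaults):
    #
    #   cfg.dataset_name = None                      # -> glob the directory
    #   cfg.dataset_path = "/data/tahoe100m"         # contains *.h5ad shards
    #   cfg.selected_gene_file = "/data/genes.pkl"   # pickled list/set of genes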

    def _proceed_data_split(self,
                            cache, target_type, key_info, holdout_setname=None, random_setname=None):
        """Dispatch to the PBMC-specific split routine."""
        return self._proceed_data_split_pbmc(cache, target_type, key_info, holdout_setname, random_setname)

    def _proceed_data_split_pbmc(self,
                                 cache, target_type, key_info, holdout_setname=None, random_setname=None
                                 ):
        """Split the cached PBMC data via ``split_pbmc``."""
        return split_pbmc(self, cache, target_type, key_info, holdout_setname, random_setname)


class Tahoe100mPerturbationDataModule(PBMCPerturbationDataModule):
    """Tahoe-100M perturbation data module used by the PerturbDiff pipeline."""

    def _proceed_data_split(self,
                            cache, target_type, key_info, holdout_setname=None, random_setname=None):
        """Dispatch to the Tahoe-100M-specific split routine."""
        return self._proceed_data_split_tahoe100m(cache, target_type, key_info, holdout_setname, random_setname)

    def _proceed_data_split_tahoe100m(self,
                                      cache, target_type, key_info, holdout_setname=None, random_setname=None
                                      ):
        """Split the cached Tahoe-100M data via ``split_tahoe100m``."""
        return split_tahoe100m(self, cache, target_type, key_info, holdout_setname, random_setname)


class CellxGeneDataModule(Tahoe100mPerturbationDataModule):
    """CellxGene data module used by the PerturbDiff pipeline."""

    def _proceed_data_split_cellxgene(self,
                                      cache, target_type, key_info, holdout_setname=None, random_setname=None
                                      ):
        """Split the cached CellxGene data via ``split_cellxgene``."""
        return split_cellxgene(self, cache, target_type, key_info, holdout_setname, random_setname)


class PerturbationPretrainingDataModule(CellxGeneDataModule):
    """Perturbation-pretraining data module used by the PerturbDiff pipeline."""

    def get_dataset_names_and_paths(self, cfg):
        """Resolve names and paths for every dataset entry in the config."""
        assert self.data_args == cfg
        assert isinstance(self.data_args.dataset_name, omegaconf.dictconfig.DictConfig)
        dataset_name_list, file_list, data_type_list, key_info_list, selected_genes_list = [], [], [], [], []
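        # Aggregate the five parallel lists returned by the parent class
        # across every dataset entry in the config.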
        for key in self.data_args.dataset_name:
            ret = super().get_dataset_names_and_paths(self.data_args.dataset_name[key], self.data_args.data_name)
            dataset_name_list.extend(ret[0])
            file_list.extend(ret[1])
            data_type_list.extend(ret[2])
            key_info_list.extend(ret[3])
            selected_genes_list.extend(ret[4])
        return dataset_name_list, file_list, data_type_list, key_info_list, selected_genes_list

    def setup_dataset(self, stage=None):
        """Construct datasets via ``perturbation_pretraining_setup_dataset``."""
        return perturbation_pretraining_setup_dataset(self, stage=stage)

    def get_new_pert_dict(self):
        """Build the updated perturbation dictionary via ``build_new_pert_dict``."""
        return build_new_pert_dict(self)

    def get_new_all_dict(self):
        """Build the updated `all` dictionary via ``build_new_all_dict``."""
        return build_new_all_dict(self)

    def setup(self, stage=None):
        """Run setup via ``perturbation_pretraining_setup``."""
        return perturbation_pretraining_setup(self, stage=stage)
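

# Usage sketch (illustrative only): the config layout and logger below are
# assumptions about how the pipeline wires these modules up, not confirmed
# defaults.
#
#   import logging
#   dm = PBMCPerturbationDataModule(
#       seed=cfg.seed,
#       micro_batch_size=cfg.micro_batch_size,
#       data_args=cfg.data,
#       py_logger=logging.getLogger(__name__),
#   )
#   dm.setup(stage="fit")
#   trainer.fit(model, datamodule=dm)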