import csv
import glob
import itertools
import json
import os
import random
import re
from contextlib import nullcontext
from typing import (
    Optional,
    Generator,
    Union,
    List,
    Iterable,
    Dict,
    TYPE_CHECKING,
    TextIO,
)

import numpy as np

if TYPE_CHECKING:  # pragma: no cover
    from docarray import Document


def from_ndarray(
    array: 'np.ndarray',
    axis: int = 0,
    size: Optional[int] = None,
    shuffle: bool = False,
    *args,
    **kwargs,
) -> Generator['Document', None, None]:
    """Create a generator over a given axis of a numpy array.

    :param array: the numpy ndarray data source
    :param axis: the axis to iterate over
    :param size: the maximum number of sub-arrays to yield
    :param shuffle: shuffle the numpy data source beforehand
    :yield: documents
    """
    from docarray.document import Document

    if shuffle:
        # shuffle for random query
        array = np.take(array, np.random.permutation(array.shape[axis]), axis=axis)
    d = 0
    # move the requested axis to the front so iteration honours ``axis``
    for r in np.moveaxis(array, axis, 0):
        yield Document(content=r)
        d += 1
        if size is not None and d >= size:
            break
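

# Illustrative usage sketch (not part of the original module). The array shape and the
# ``size``/``shuffle`` values below are assumptions; with ``size=5`` only five shuffled
# rows are wrapped into Documents.
def _example_from_ndarray():  # pragma: no cover
    arr = np.random.random((10, 3))
    for doc in from_ndarray(arr, size=5, shuffle=True):
        print(doc.content.shape)  # -> (3,), one row of the array per Document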


def from_files(
    patterns: Union[str, List[str]],
    recursive: bool = True,
    size: Optional[int] = None,
    sampling_rate: Optional[float] = None,
    read_mode: Optional[str] = None,
    to_dataturi: bool = False,
    exclude_regex: Optional[str] = None,
    *args,
    **kwargs,
) -> Generator['Document', None, None]:
    """Creates an iterator over a list of file paths or the content of the files.

    :param patterns: The pattern may contain simple shell-style wildcards, e.g. '\*.py', '[\*.zip, \*.gz]'
    :param recursive: If recursive is set to True, the pattern '**' will match any files
        and zero or more directories and subdirectories
    :param size: the maximum number of files to yield
    :param sampling_rate: the sampling rate between [0, 1]
    :param read_mode: specifies the mode in which the file is opened.
        'r' for reading in text mode, 'rb' for reading in binary mode.
        If `read_mode` is None, will iterate over filenames.
    :param to_dataturi: if set, then Document.uri will be filled with a DataURI instead of the plain URI
    :param exclude_regex: if set, filenames that match this pattern are not included.
    :yield: file paths or binary content

    .. note::
        This function should not be used directly; use :meth:`Flow.index_files`, :meth:`Flow.search_files` instead
    """
    from docarray.document import Document

    if read_mode not in {'r', 'rb', None}:
        raise RuntimeError(f'read_mode should be "r", "rb" or None, got `{read_mode}`')

    def _iter_file_exts(ps):
        return itertools.chain.from_iterable(
            glob.iglob(os.path.expanduser(p), recursive=recursive) for p in ps
        )

    num_docs = 0
    if isinstance(patterns, str):
        patterns = [patterns]
    _r = None
    if exclude_regex:
        try:
            _r = re.compile(exclude_regex)
        except re.error:
            raise ValueError(f'`{exclude_regex}` is not a valid regex.')

    for g in _iter_file_exts(patterns):
        if os.path.isdir(g):
            continue
        if _r and _r.match(g):
            continue
        if sampling_rate is None or random.random() < sampling_rate:
            if read_mode is None:
                d = Document(uri=g)
                if to_dataturi:
                    d.convert_uri_to_datauri()
                yield d
            elif read_mode in {'r', 'rb'}:
                with open(g, read_mode) as fp:
                    d = Document(content=fp.read(), uri=g)
                    if to_dataturi:
                        d.convert_uri_to_datauri()
                    yield d
            num_docs += 1
            if size is not None and num_docs >= size:
                break
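

# Illustrative usage sketch (not part of the original module). The glob pattern and the
# exclude regex below are assumptions; with ``read_mode='rb'`` each Document carries the
# file's binary content in addition to its uri.
def _example_from_files():  # pragma: no cover
    for doc in from_files(
        '~/datasets/images/**/*.png',
        read_mode='rb',
        size=100,
        exclude_regex=r'.*\.tmp\.png$',
    ):
        print(doc.uri, len(doc.content))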


def from_csv(
    file: Union[str, TextIO],
    field_resolver: Optional[Dict[str, str]] = None,
    size: Optional[int] = None,
    sampling_rate: Optional[float] = None,
    dialect: Union[str, 'csv.Dialect'] = 'excel',
    encoding: str = 'utf-8',
    *args,
    **kwargs,
) -> Generator['Document', None, None]:
    """Generator function for CSV. Yields documents.

    :param file: a file path or a file handle
    :param field_resolver: a map from field names defined in the JSON/dict to the field
        names defined in Document.
    :param size: the maximum number of documents to yield
    :param sampling_rate: the sampling rate between [0, 1]
    :param dialect: defines a set of parameters specific to a particular CSV dialect. It can be the name of a
        dialect predefined on your system, or a :class:`csv.Dialect` class that groups specific formatting
        parameters together. If you don't know the dialect and the default one does not work for you,
        you can try setting it to ``auto``.
    :param encoding: encoding used to read the CSV file. By default, ``utf-8`` is used.
    :yield: documents
    """
    from docarray.document import Document

    if hasattr(file, 'read'):
        file_ctx = nullcontext(file)
    else:
        file_ctx = open(file, 'r', encoding=encoding)

    with file_ctx as fp:
        # when set to `auto`, sniff the dialect from the first bytes of the file
        try:
            if isinstance(dialect, str) and dialect == 'auto':
                dialect = csv.Sniffer().sniff(fp.read(1024))
                fp.seek(0)
        except Exception:
            dialect = 'excel'  # cannot sniff the delimiter, fall back to the default dialect

        lines = csv.DictReader(fp, dialect=dialect)
        for value in _subsample(lines, size, sampling_rate):
            if 'groundtruth' in value and 'document' in value:
                yield Document(
                    value['document'], field_resolver=field_resolver
                ), Document(value['groundtruth'], field_resolver=field_resolver)
            else:
                yield Document(value, field_resolver=field_resolver)
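

# Illustrative usage sketch (not part of the original module). The CSV path and the
# ``question`` column name are assumptions; ``field_resolver`` maps the ``question``
# column onto Document.text, and ``dialect='auto'`` triggers delimiter sniffing.
def _example_from_csv():  # pragma: no cover
    for doc in from_csv(
        'toy.csv',
        field_resolver={'question': 'text'},
        size=10,
        dialect='auto',
    ):
        print(doc.text)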


def from_huggingface_datasets(
    dataset_path: str,
    field_resolver: Optional[Dict[str, str]] = None,
    size: Optional[int] = None,
    sampling_rate: Optional[float] = None,
    filter_fields: bool = False,
    **datasets_kwargs,
) -> Generator['Document', None, None]:
    """Generator function for Hugging Face Datasets. Yields documents.

    This function helps to load datasets from the Hugging Face Datasets Hub
    (https://huggingface.co/datasets) in Jina. Additional parameters can be
    passed to the ``datasets`` library using keyword arguments. The ``load_dataset``
    method from the ``datasets`` library is used to load the datasets.

    :param dataset_path: a valid dataset path for the Hugging Face Datasets library.
    :param field_resolver: a map from field names defined in ``document`` (JSON, dict) to the field
        names defined in Protobuf. This is only used when the given ``document`` is
        a JSON string or a Python dict.
    :param size: the maximum number of documents to yield
    :param sampling_rate: the sampling rate between [0, 1]
    :param filter_fields: specifies whether to filter the dataset with the fields
        given in the ``field_resolver`` argument.
    :param **datasets_kwargs: additional arguments for the ``load_dataset`` method
        from the Datasets library. More details at
        https://huggingface.co/docs/datasets/package_reference/loading_methods.html#datasets.load_dataset
    :yield: documents
    """
    from docarray.document import Document
    import datasets

    # Load the dataset using the given arguments
    data = datasets.load_dataset(dataset_path, **datasets_kwargs)

    # Validate that a single split was loaded, not a dict of splits
    if isinstance(data, (datasets.DatasetDict, datasets.IterableDatasetDict)):
        raise ValueError(
            (
                'Please provide a split for the dataset using the "split" argument. '
                f'The following splits are available for this dataset: {list(data.keys())}'
            )
        )

    # Filter the dataset columns if needed
    if filter_fields:
        if not field_resolver:
            raise ValueError(
                'The filter fields option requires "field_resolver" to be provided.'
            )
        else:
            data.set_format(type=None, columns=list(field_resolver.keys()))

    # Yield documents from dataset instances, with subsampling if required
    for value in _subsample(data, size, sampling_rate):
        yield Document(value, field_resolver=field_resolver)
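

# Illustrative usage sketch (not part of the original module). The ``squad`` dataset and
# its ``context`` field are assumptions; ``split='train'`` is passed through to
# ``load_dataset`` because this generator rejects multi-split DatasetDict objects.
def _example_from_huggingface_datasets():  # pragma: no cover
    for doc in from_huggingface_datasets(
        'squad',
        field_resolver={'context': 'text'},
        filter_fields=True,
        size=5,
        split='train',
    ):
        print(doc.text[:80])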


def from_ndjson(
    fp: Iterable[str],
    field_resolver: Optional[Dict[str, str]] = None,
    size: Optional[int] = None,
    sampling_rate: Optional[float] = None,
    *args,
    **kwargs,
) -> Generator['Document', None, None]:
    """Generator function for line-separated JSON. Yields documents.

    :param fp: a file handle or any iterable of strings, where each line is one JSON document
    :param field_resolver: a map from field names defined in ``document`` (JSON, dict) to the field
        names defined in Protobuf. This is only used when the given ``document`` is
        a JSON string or a Python dict.
    :param size: the maximum number of documents to yield
    :param sampling_rate: the sampling rate between [0, 1]
    :yield: documents
    """
    from docarray.document import Document

    for line in _subsample(fp, size, sampling_rate):
        value = json.loads(line)
        if 'groundtruth' in value and 'document' in value:
            yield Document(value['document'], field_resolver=field_resolver), Document(
                value['groundtruth'], field_resolver=field_resolver
            )
        else:
            yield Document(value, field_resolver=field_resolver)
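

# Illustrative usage sketch (not part of the original module). The in-memory lines below
# stand in for a .jsonl file; each JSON object becomes one Document.
def _example_from_ndjson():  # pragma: no cover
    lines = [
        '{"text": "hello"}',
        '{"text": "world"}',
    ]
    for doc in from_ndjson(lines):
        print(doc.text)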


def from_lines(
    lines: Optional[Iterable[str]] = None,
    filepath: Optional[str] = None,
    read_mode: str = 'r',
    line_format: str = 'json',
    field_resolver: Optional[Dict[str, str]] = None,
    size: Optional[int] = None,
    sampling_rate: Optional[float] = None,
) -> Generator['Document', None, None]:
    """Generator function for lines, JSON and CSV. Yields documents or strings.

    :param lines: a list of strings, each of which is considered a document
    :param filepath: a text file in which each line contains a document
    :param read_mode: specifies the mode in which the file
        is opened. 'r' for reading in text mode, 'rb' for reading in binary mode
    :param line_format: the format of each line: ``json`` or ``csv``
    :param field_resolver: a map from field names defined in ``document`` (JSON, dict) to the field
        names defined in Protobuf. This is only used when the given ``document`` is
        a JSON string or a Python dict.
    :param size: the maximum number of documents to yield
    :param sampling_rate: the sampling rate between [0, 1]
    :yield: documents
    """
    if filepath:
        file_type = os.path.splitext(filepath)[1]
        with open(os.path.expanduser(filepath), read_mode) as f:
            if file_type in _jsonl_ext:
                yield from from_ndjson(f, field_resolver, size, sampling_rate)
            elif file_type in _csv_ext:
                yield from from_csv(f, field_resolver, size, sampling_rate)
            else:
                yield from _subsample(f, size, sampling_rate)
    elif lines:
        if line_format == 'json':
            yield from from_ndjson(lines, field_resolver, size, sampling_rate)
        elif line_format == 'csv':
            yield from from_csv(lines, field_resolver, size, sampling_rate)
        else:
            yield from _subsample(lines, size, sampling_rate)
    else:
        raise ValueError('"filepath" and "lines" cannot both be empty')
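

# Illustrative usage sketch (not part of the original module). The file name is an
# assumption; because its extension is in ``_jsonl_ext``, the call is routed to
# :func:`from_ndjson`.
def _example_from_lines():  # pragma: no cover
    for doc in from_lines(filepath='corpus.jsonl', size=10):
        print(doc)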


# https://github.com/ndjson/ndjson.github.io/issues/1#issuecomment-109935996
_jsonl_ext = {'.jsonlines', '.ndjson', '.jsonl', '.jl', '.ldjson'}
_csv_ext = {'.csv', '.tcsv'}


def _sample(iterable, sampling_rate: Optional[float] = None):
    for i in iterable:
        if sampling_rate is None or random.random() < sampling_rate:
            yield i


def _subsample(
    iterable, size: Optional[int] = None, sampling_rate: Optional[float] = None
):
    yield from itertools.islice(_sample(iterable, sampling_rate), size)
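

# Illustrative sketch (not part of the original module): _subsample composes a random
# keep/skip filter (_sample) with itertools.islice, so at most ``size`` items are yielded
# and, on average, a ``sampling_rate`` fraction of the stream survives.
def _example_subsample():  # pragma: no cover
    kept = list(_subsample(range(1000), size=10, sampling_rate=0.5))
    print(kept)  # at most 10 values, each drawn with ~50% probability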