opencv/modules/ocl/src/sort_by_key.cpp at 2.4 · pythonwebcoder/opencv

History

472 lines (417 loc) · 17 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

/*M///////////////////////////////////////////////////////////////////////////////////////

// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.

// By downloading, copying, installing or using the software you agree to this license.

// If you do not agree to this license, do not download, install,

// copy or use the software.

// License Agreement

// For Open Source Computer Vision Library

// Third party copyrights are property of their respective owners.

// @Authors

// Peng Xiao, pengxiao@outlook.com

// Redistribution and use in source and binary forms, with or without modification,

// are permitted provided that the following conditions are met:

// * Redistribution's of source code must retain the above copyright notice,

// this list of conditions and the following disclaimer.

// * Redistribution's in binary form must reproduce the above copyright notice,

// this list of conditions and the following disclaimer in the documentation

// and/or other materials provided with the distribution.

// * The name of the copyright holders may not be used to endorse or promote products

// derived from this software without specific prior written permission.

// This software is provided by the copyright holders and contributors as is and

// any express or implied warranties, including, but not limited to, the implied

// warranties of merchantability and fitness for a particular purpose are disclaimed.

// In no event shall the Intel Corporation or contributors be liable for any direct,

// indirect, incidental, special, exemplary, or consequential damages

// (including, but not limited to, procurement of substitute goods or services;

// loss of use, data, or profits; or business interruption) however caused

// and on any theory of liability, whether in contract, strict liability,

// or tort (including negligence or otherwise) arising in any way out of

// the use of this software, even if advised of the possibility of such damage.

//M*/

#include "precomp.hpp"

#include "opencl_kernels.hpp"

using namespace cv;

using namespace cv::ocl;

namespace cv

{

namespace ocl

{

// Forward declaration of the dispatching entry point; its definition appears
// further down in this file and forwards to one of the per-algorithm
// sortByKey() helpers according to `method`.
void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan);

#ifndef ANDROID

// Work-group size used as localThreads[0] by the bitonic and selection sort
// launches (and to size the selection sort's local buffer) on non-Android
// builds. The Android code paths pass NULL for the local size instead, which
// is why the constant is compiled out there.
//TODO(pengx17): change this value depending on device other than a constant

const static unsigned int GROUP_SIZE = 256;

#endif

// OpenCL C scalar type names, indexed by the OpenCV depth code returned by
// oclMat::depth(). genSortBuildOption() uses the entries to define the K_T
// (key type) and V_T (value type) macros passed to the kernel compiler.
// The table has exactly seven entries (CV_8U .. CV_64F); lookups are not
// bounds-checked.
const char * depth_strings[] =

{

"uchar", //CV_8U

"char", //CV_8S

"ushort", //CV_16U

"short", //CV_16S

"int", //CV_32S

"float", //CV_32F

"double" //CV_64F

};

// Writes the kernel build options shared by every sort implementation into
// build_opt_buf as a NUL-terminated string of the form
//     -D IS_GT=<0|1> -D K_T=<key type> -D V_T=<value type>[<channels>]
//
// keys          - supplies the key element type (looked up by depth()).
// vals          - supplies the value element type; when it has more than one
//                 OpenCL channel the channel count is appended to the type
//                 name (e.g. "float" followed by "4").
// isGreaterThan - selects IS_GT=1 (true) or IS_GT=0 (false).
// build_opt_buf - destination buffer owned by the caller. No bounds checking
//                 is performed; the callers in this file pass 100-byte buffers.
void static genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf)
{
    sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s",
            isGreaterThan ? 1 : 0, depth_strings[keys.depth()], depth_strings[vals.depth()]);

    const int valChannels = vals.oclchannels();
    if(valChannels > 1)
    {
        // Turn the scalar value type into the matching vector type name.
        sprintf(build_opt_buf + strlen(build_opt_buf), "%d", valChannels);
    }
}

// Returns true when `size` is an exact power of two (1, 2, 4, 8, ...).
//
// Zero is rejected explicitly: the bit trick alone evaluates (0 - 1) & 0 == 0
// and would otherwise report 0 as a power of two.
inline bool isSizePowerOf2(size_t size)
{
    return size != 0 && ((size - 1) & size) == 0;
}

namespace bitonic_sort

{

// Sorts the first vecSize elements of `keys` in place with the "bitonicSort"
// kernel, applying the same kernel to `vals`.
//
// keys, vals    - device matrices handed to the kernel.
// vecSize       - number of elements to sort; must be a power of two
//                 (enforced with CV_Assert). Sizes 0 and 1 are returned as-is.
// isGreaterThan - forwarded to the kernel build options as IS_GT.
//
// The kernel is launched once per (stage, passOfStage) pair, i.e.
// numStages * (numStages + 1) / 2 times, over vecSize / 2 work-items.
static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
{
    if(vecSize < 2)
    {
        return; // zero or one element: nothing to sort, no kernel launches needed
    }

    CV_Assert(isSizePowerOf2(vecSize));

    Context * cxt = Context::getContext();
    size_t globalThreads[3] = {vecSize / 2, 1, 1};

    // 2^numStages should be equal to vecSize or the output is invalid
    int numStages = 0;
    for(size_t remaining = vecSize; remaining > 1; remaining >>= 1)
    {
        ++numStages;
    }

    char build_opt_buf [100];
    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);

    // The argument is declared with sizeof(cl_int), so hand over a real cl_int
    // instead of the address of a size_t (which only yields the right value
    // when the low-order bytes happen to come first in memory).
    cl_int kernelVecSize = static_cast<cl_int>(vecSize);

    const int argc = 5;
    std::vector< std::pair<size_t, const void *> > args(argc);
    String kernelname = "bitonicSort";

    args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
    args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
    args[2] = std::make_pair(sizeof(cl_int), (void *)&kernelVecSize);

    for(int stage = 0; stage < numStages; ++stage)
    {
        // args[3]/args[4] point at the loop counters, so each launch sees the
        // current stage and pass values.
        args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
        for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
        {
            args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
#ifdef ANDROID
            openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, NULL, args, -1, -1, build_opt_buf);
#else
            size_t localThreads[3] = {GROUP_SIZE, 1, 1};
            openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
#endif
        }
    }
}

} /* bitonic_sort */

namespace selection_sort

{

// FIXME:
// This function cannot sort arrays with duplicated keys
//
// Selection-sort based implementation. It is currently disabled: the first
// statement raises an error unconditionally, so the kernel launches below are
// kept only for reference until the algorithm is fixed.
//
// The intended sequence is two launches over vecSize work-items:
//   1. "selectionSortLocal" with keys, vals, vecSize and a NULL-valued
//      argument of lds_size bytes,
//   2. "selectionSortFinal" with the same arguments minus that last one.
static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
{
    CV_Error(-1, "This function is incorrect at the moment.");

    Context * clContext = Context::getContext();

    char buildOptions[100];
    genSortBuildOption(keys, vals, isGreaterThan, buildOptions);

    // Pick the work-group configuration once instead of at every launch site.
#ifdef ANDROID
    size_t * const localSize = NULL;
    int ldsBytes = clContext->getDeviceInfo().maxWorkGroupSize * keys.elemSize();
#else
    size_t localThreads[3] = {GROUP_SIZE, 1, 1};
    size_t * const localSize = localThreads;
    int ldsBytes = GROUP_SIZE * keys.elemSize();
#endif
    size_t globalSize[3] = {vecSize, 1, 1};

    std::vector< std::pair<size_t, const void *> > kernelArgs;
    kernelArgs.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
    kernelArgs.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
    kernelArgs.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
    kernelArgs.push_back(std::make_pair(ldsBytes, (void*)NULL));

    // local pass
    String kernelName = "selectionSortLocal";
    openCLExecuteKernel(clContext, &kernel_sort_by_key, kernelName, globalSize, localSize, kernelArgs, -1, -1, buildOptions);

    // final pass: identical arguments without the trailing NULL-valued one
    kernelArgs.pop_back();
    kernelName = "selectionSortFinal";
    openCLExecuteKernel(clContext, &kernel_sort_by_key, kernelName, globalSize, localSize, kernelArgs, -1, -1, buildOptions);
}

} /* selection_sort */

namespace radix_sort

{

//FIXME(pengx17):

// exclusive scan, need to be optimized as this is too naive...

//void naive_scan_addition(oclMat& input, oclMat& output)

//{

// Context * cxt = Context::getContext();

// size_t vecSize = input.cols;

// size_t globalThreads[3] = {1, 1, 1};

// size_t localThreads[3] = {1, 1, 1};

// String kernelname = "naiveScanAddition";

// std::vector< std::pair<size_t, const void *> > args;

// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));

// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));

// args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));

// openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);

//}

// Exclusive prefix sum computed on the host:
//     output[0] = 0,  output[k] = input[0] + ... + input[k - 1]
//
// input  - device matrix of int elements; it is converted to a host Mat first.
// output - device matrix that receives the result; a host Mat with output's
//          size and type is filled and then assigned back to it.
//          Assumes output holds at least as many elements as input.
//
// An empty input leaves output untouched (previously the first element was
// written unconditionally, which is invalid for an empty matrix).
void static naive_scan_addition_cpu(oclMat& input, oclMat& output)
{
    Mat m_input = input, m_output(output.size(), output.type());

    MatIterator_<int> i_mit = m_input.begin<int>();
    const MatIterator_<int> i_end = m_input.end<int>();
    if(i_mit == i_end)
    {
        return; // nothing to scan
    }

    MatIterator_<int> o_mit = m_output.begin<int>();
    int runningSum = 0; // sum of all input elements before the current one
    for(; i_mit != i_end; ++i_mit, ++o_mit)
    {
        *o_mit = runningSum;
        runningSum += *i_mit;
    }

    output = m_output;
}

//radix sort ported from Bolt
//
// Sorts the first origVecSize elements of `keys` (and the matching `vals`)
// with a least-significant-digit radix sort, RADIX bits per pass.
//
// keys          - single-row device matrix of 4-byte keys (CV_32S or CV_32F,
//                 enforced with CV_Assert).
// vals          - device matrix of values permuted together with the keys.
// origVecSize   - number of elements to sort; 0 returns immediately.
// isGreaterThan - forwarded to the kernel build options as IS_GT; also picks
//                 the padding key (lowest value when true, highest when false).
//
// The kernels work on a length that is a multiple of groupSize * RADICES. When
// origVecSize is not such a multiple, the data is copied into padded temporary
// buffers, sorted there, and the first origVecSize elements are copied back.
// Each pass runs "histogramRadixN", a host-side exclusive scan of the
// histogram, then "permuteRadixN", ping-ponging between two buffer pairs.
static void sortByKey(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan)
{
    CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes

    if(origVecSize == 0)
    {
        return; // nothing to sort; avoids launching kernels over an empty range
    }

    bool isKeyFloat = keys.type() == CV_32F;

    const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
                         //local array of 16 elements in the histogram kernel.
    const int RADICES = (1 << RADIX); //Values handled by each work-item?

    bool newBuffer = false;
    size_t vecSize = origVecSize;

    unsigned int groupSize = RADICES;
    size_t mulFactor = groupSize * RADICES;

    // The part of the caller's matrices that is actually being sorted.
    const Rect origRange(0, 0, static_cast<int>(origVecSize), 1);

    oclMat buffer_keys, buffer_vals;

    if(origVecSize % mulFactor != 0)
    {
        // Round up to the next multiple of mulFactor and work on padded copies.
        vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor;
        buffer_keys.create(1, vecSize, keys.type());
        buffer_vals.create(1, vecSize, vals.type());

        // Padding keys take the extreme value for the requested order
        // (presumably so that they end up behind all real elements).
        Scalar padding_value;
        switch(buffer_keys.depth())
        {
        case CV_32F:
            padding_value = Scalar::all(isGreaterThan ? -FLT_MAX : FLT_MAX);
            break;
        case CV_32S:
            padding_value = Scalar::all(isGreaterThan ? INT_MIN : INT_MAX);
            break;
        }

        ocl::copyMakeBorder(
            keys(origRange), buffer_keys,
            0, 0, 0, vecSize - origVecSize,
            BORDER_CONSTANT, padding_value);

        oclMat roi_buffer_vals = buffer_vals(origRange);
        vals(origRange).copyTo(roi_buffer_vals);
        newBuffer = true;
    }
    else
    {
        // Already a suitable length: sort directly in the caller's storage.
        buffer_keys = keys;
        buffer_vals = vals;
        newBuffer = false;
    }

    oclMat swap_input_keys(1, vecSize, keys.type());
    oclMat swap_input_vals(1, vecSize, vals.type());
    oclMat hist_bin_keys(1, vecSize, CV_32SC1);
    oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1);

    Context * cxt = Context::getContext();

    size_t globalThreads[3] = {vecSize / RADICES, 1, 1};
    size_t localThreads[3] = {groupSize, 1, 1};

    std::vector< std::pair<size_t, const void *> > args;
    char build_opt_buf [100];
    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);

    //additional build option for radix sort
    sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT");

    String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")};

    // Ping-pong buffers: every pass reads from src_* and writes to dst_*.
    oclMat * src_keys = &buffer_keys;
    oclMat * src_vals = &buffer_vals;
    oclMat * dst_keys = &swap_input_keys;
    oclMat * dst_vals = &swap_input_vals;

    for(int bits = 0; bits < (static_cast<int>(keys.elemSize()) * 8); bits += RADIX)
    {
        //Do a histogram pass locally
        args.clear();
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src_keys->data));
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data));
        args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
        openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads,
                            args, -1, -1, build_opt_buf);

        //Perform a global scan
        naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys);
        // end of scan

        //Permute keys and values into the other buffer pair
        args.clear();
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src_keys->data));
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src_vals->data));
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data));
        args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst_keys->data));
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst_vals->data));
        openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads,
                            args, -1, -1, build_opt_buf);

        // Swap the roles of the two buffer pairs for the next pass.
        oclMat * const prev_src_keys = src_keys;
        oclMat * const prev_src_vals = src_vals;
        src_keys = dst_keys;
        src_vals = dst_vals;
        dst_keys = prev_src_keys;
        dst_vals = prev_src_vals;
    }

    // 4-byte keys with RADIX = 4 give an even number of passes, so the final
    // result is back in buffer_keys / buffer_vals.
    if(newBuffer)
    {
        // Copy into ROIs of the caller's matrices rather than into the
        // matrices themselves: copyTo() sizes its destination to match the
        // source, which would shrink keys/vals to origVecSize columns whenever
        // they are wider than the sorted range.
        oclMat roi_keys = keys(origRange);
        oclMat roi_vals = vals(origRange);
        buffer_keys(origRange).copyTo(roi_keys);
        buffer_vals(origRange).copyTo(roi_vals);
    }
}

} /* radix_sort */

namespace merge_sort

{

// Sorts the first vecSize elements of `keys` (and the matching `vals`) in two
// phases: a "blockInsertionSort" launch, followed by zero or more out-of-place
// "merge" launches that alternate between the caller's matrices and temporary
// buffers.
//
// keys, vals    - device matrices handed to the kernels.
// vecSize       - number of elements to sort.
// isGreaterThan - forwarded to the kernel build options as IS_GT.
//
// The work-group size is 256 when the device reports maxWorkGroupSize >= 256,
// otherwise 128.
static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
{
    Context * cxt = Context::getContext();

    const size_t groupSize = cxt->getDeviceInfo().maxWorkGroupSize >= 256 ? 256: 128;

    size_t globalThreads[3] = {vecSize, 1, 1};
    size_t localThreads[3] = {groupSize, 1, 1};

    std::vector< std::pair<size_t, const void *> > args;
    char build_opt_buf [100];
    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);

    String kernelname[] = {String("blockInsertionSort"), String("merge")};
    int keylds_size = groupSize * keys.elemSize();
    int vallds_size = groupSize * vals.elemSize();

    // The argument is declared with sizeof(cl_uint), so hand over a real
    // cl_uint instead of the address of a size_t (which only yields the right
    // value when the low-order bytes happen to come first in memory).
    cl_uint kernelVecSize = static_cast<cl_uint>(vecSize);

    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
    args.push_back(std::make_pair(sizeof(cl_uint), (void *)&kernelVecSize));
    args.push_back(std::make_pair(keylds_size, (void*)NULL));
    args.push_back(std::make_pair(vallds_size, (void*)NULL));
    openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf);

    // Early exit for the case of no merge passes, values are already in destination vector
    if(vecSize <= groupSize)
    {
        return;
    }

    // An odd number of elements requires an extra merge pass to sort
    size_t numMerges = 0;
    // Calculate the log2 of vecSize, taking into account our block size from kernel 1 is 64
    // this is how many merge passes we want
    size_t log2BlockSize = vecSize >> 6;
    for( ; log2BlockSize > 1; log2BlockSize >>= 1 )
    {
        ++numMerges;
    }
    // NOTE(review): the original comment said a last merge pass is needed when
    // the size is NOT a power of two, but the code adds the pass when it IS.
    // The shift by 6 above also assumes a block size of 64 while groupSize is
    // 128 or 256. The expression is kept unchanged; confirm the intended pass
    // count against the kernel before altering it.
    numMerges += isSizePowerOf2(vecSize)? 1: 0;

    // Allocate a flipflop buffer because the merge passes are out of place
    oclMat tmpKeyBuffer(keys.size(), keys.type());
    oclMat tmpValBuffer(vals.size(), vals.type());

    args.resize(8);
    args[4] = std::make_pair(sizeof(cl_uint), (void *)&kernelVecSize);
    args[6] = std::make_pair(keylds_size, (void*)NULL);
    args[7] = std::make_pair(vallds_size, (void*)NULL);

    for(size_t pass = 1; pass <= numMerges; ++pass )
    {
        // For each pass, flip the input-output buffers
        if( pass & 0x1 )
        {
            args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
            args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
            args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
            args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
        }
        else
        {
            args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
            args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
            args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
            args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
        }

        // For each pass, the merge window doubles
        unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) );
        args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize);
        openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf);
    }

    // If there are an odd number of merges, then the output data is sitting in the temp buffer. We need to copy
    // the results back into the input array
    if( numMerges & 1 )
    {
        // Copy back only the sorted range. The temp buffers are as large as
        // the caller's matrices, and anything past vecSize in them was
        // presumably never written by the kernels, so a whole-matrix copy
        // would clobber the caller's elements beyond vecSize.
        const Rect sortedRange(0, 0, static_cast<int>(vecSize), 1);
        oclMat roi_keys = keys(sortedRange);
        oclMat roi_vals = vals(sortedRange);
        tmpKeyBuffer(sortedRange).copyTo(roi_keys);
        tmpValBuffer(sortedRange).copyTo(roi_vals);
    }
}

} /* merge_sort */

}

} /* namespace cv { namespace ocl */

// Sorts the first vecSize elements of `keys`, reordering `vals` alongside,
// using the algorithm selected by `method`.
//
// keys          - single-row, single-channel device matrix (both enforced).
// vals          - device matrix of values associated with the keys.
// vecSize       - number of leading elements to sort; must not exceed keys.cols.
// method        - one of SORT_BITONIC, SORT_SELECTION, SORT_RADIX, SORT_MERGE.
//                 Any other value raises an error instead of silently leaving
//                 the data unsorted.
// isGreaterThan - forwarded to the selected implementation.
void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan)
{
    CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
    CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
    CV_Assert( vecSize <= static_cast<size_t>(keys.cols) );

    switch(method)
    {
    case SORT_BITONIC:
        bitonic_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
        break;
    case SORT_SELECTION:
        selection_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
        break;
    case SORT_RADIX:
        radix_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
        break;
    case SORT_MERGE:
        merge_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
        break;
    default:
        CV_Error(-1, "Unsupported sort method");
        break;
    }
}

// Convenience overload that sorts every element of `keys`.
//
// Requires keys and vals to have the same size and keys to be a single row,
// then forwards to the vecSize overload with vecSize = keys.cols.
void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, int method, bool isGreaterThan)
{
    CV_Assert( keys.size() == vals.size() );
    CV_Assert( keys.rows == 1 ); // we only allow one dimensional input

    // Whole-row sort: the element count is simply the number of columns.
    sortByKey(keys, vals, static_cast<size_t>(keys.cols), method, isGreaterThan);
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

sort_by_key.cpp

Latest commit

History

sort_by_key.cpp

File metadata and controls