
My web application does a very long computation and then presents the results. I'm using WebGL2 for the computation, drawing into an offscreen 2D texture. I can't simply do it in a single WebGL call - the computation would take too long and trigger the "lost context" error. So I split the computation into rectangular parts that can each be drawn in a short time.
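
Concretely, each "rectangular part" is a scissored draw into the offscreen texture, something like this (a sketch; offscreenFbo and the quad setup are placeholders, not my real code):

function drawTile(x, y, tileSize) {
  gl.bindFramebuffer(gl.FRAMEBUFFER, offscreenFbo); // FBO with the target 2D texture attached
  gl.enable(gl.SCISSOR_TEST);
  gl.scissor(x, y, tileSize, tileSize); // restrict this draw to one rectangle
  gl.drawArrays(gl.TRIANGLES, 0, 6);    // full-texture quad, clipped to the tile
  gl.disable(gl.SCISSOR_TEST);
}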

The problem is scheduling these WebGL calls. If I issue them too often, the browser may become unresponsive or take away my WebGL context. If I don't issue them often enough, the computation takes longer than necessary. I understand that losing the context once in a while is normal; what I'm afraid of is losing it systematically because I'm using the GPU too much.

The best I could think of is to have some work-to-sleep ratio and sleep for a fraction of the time spent on the computation. I think I can use WebGL2 sync objects to wait for the issued calls to complete and to roughly estimate how long they took. Something like this (a sketch; rectanglesPerBatch and vertexCount stand in for real values):

var workSleepRatio = 0.5; // some value
var waitPeriod = 5;       // ms between sync-status polls
var sync;
var startTime;

function makeSomeWebglCalls() {
    startTime = performance.now();
    // rectanglesPerBatch: estimate how many rectangles we can draw
    // so as not to waste too much time on waiting
    for (var i = 0; i < rectanglesPerBatch; ++i) {
        gl.drawArrays(gl.TRIANGLES, 0, vertexCount); // draw next small rectangle
    }
    // fence *after* the draws so it signals once they complete
    sync = gl.fenceSync(gl.SYNC_GPU_COMMANDS_COMPLETE, 0);
    gl.flush(); // ensure the fence is submitted to the GPU
    setTimeout(timerCb, waitPeriod);
}

function timerCb() {
    var status = gl.getSyncParameter(sync, gl.SYNC_STATUS);
    if (status != gl.SIGNALED) {
        setTimeout(timerCb, waitPeriod);
    } else {
        gl.deleteSync(sync);
        
        var workTime = performance.now() - startTime;
        setTimeout(makeSomeWebglCalls, Math.min(1000, workTime * workSleepRatio));
    }
}

makeSomeWebglCalls();

This approach is not very good and it has these problems:

  • I don't know what to set workSleepRatio to.
  • Time is wasted between the GPU finishing its work and my timer callback firing. I can't rely on gl.clientWaitSync because its timeout parameter is clamped to zero in many browsers, even in a Web Worker thread (see the polling sketch after this list).
  • However high I set workSleepRatio, I still can't be sure the browser won't decide I'm doing too much and take away the WebGL context. Maybe requestAnimationFrame could be used to slow down when it's being throttled, but then the user couldn't switch tabs while waiting for the computation to complete.
  • setTimeout might be throttled by the browser and sleep much longer than requested.
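
For reference, the polling I'm doing around the fence looks roughly like this (a hypothetical helper; the waitPeriod trade-off is exactly the problem):

function waitForSync(gl, sync, waitPeriod) {
  return new Promise(function(resolve) {
    function check() {
      // a timeout of 0 just polls; non-zero timeouts are capped by
      // MAX_CLIENT_WAIT_TIMEOUT_WEBGL, which is 0 in many browsers
      var status = gl.clientWaitSync(sync, 0, 0);
      if (status === gl.ALREADY_SIGNALED || status === gl.CONDITION_SATISFIED) {
        resolve();
      } else {
        setTimeout(check, waitPeriod); // the wasted time lives here
      }
    }
    check();
  });
}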

So, in short, I have these questions:

  • How can one utilize WebGL without overloading it, but also without wasting time? Is this even possible?
  • If that's not possible, are there better ways to deal with the problem?

1 Answer


You might be able to use the EXT_disjoint_timer_query_webgl2 extension?

function main() {
  const gl = document.createElement('canvas').getContext('webgl2', {
    powerPreference: 'high-performance',
  });
  if (!gl) {
    log('need WebGL2');
    return;
  }
  log(`powerPreference: ${gl.getContextAttributes().powerPreference}\n\n`);
  const ext = gl.getExtension('EXT_disjoint_timer_query_webgl2');
  if (!ext) {
    log('need EXT_disjoint_timer_query_webgl2');
    return;
  }

  const vs = `#version 300 es
  in vec4 position;
  void main() {
    gl_Position = position;
  }
  `;

  const fs = `#version 300 es
  precision highp float;
  uniform sampler2D tex;
  out vec4 fragColor;
  void main() {
    const int across = 100;
    const int up = 100;
    vec2 size = vec2(textureSize(tex, 0));
    vec4 sum = vec4(0);
    for (int y = 0; y < up; ++y) {
      for (int x = 0; x < across; ++x) {
        vec2 start = gl_FragCoord.xy + vec2(x, y);
        vec2 uv = (mod(start, size) + 0.5) / size;
        uv = texture(tex, uv).xy;
        uv = texture(tex, uv).xy;
        uv = texture(tex, uv).xy;
        uv = texture(tex, uv).xy;
        uv = texture(tex, uv).xy;
        uv = texture(tex, uv).xy;
        uv = texture(tex, uv).xy;
        sum += texture(tex, uv);
      }
    }  
    fragColor = sum / float(across * up);
  }
  `;

  const programInfo = twgl.createProgramInfo(gl, [vs, fs]);
  const bufferInfo = twgl.primitives.createXYQuadBufferInfo(gl);

  const pixels = new Uint8Array(1024 * 1024 * 4);
  for (let i = 0; i < pixels.length; ++i) {
    pixels[i] = Math.random() * 256;
  }
  // creates a 1024x1024 RGBA texture.
  const tex = twgl.createTexture(gl, {src: pixels});

  gl.useProgram(programInfo.program);
  twgl.setBuffersAndAttributes(gl, programInfo, bufferInfo);

  const waitFrame = _ => new Promise(resolve => requestAnimationFrame(resolve));

  const widthHeightFromIndex = i => {
    const height = 2 ** (i / 2 | 0);
    const width = height * (i % 2 + 1);
    return { width, height };
  };

  async function getSizeThatRunsUnderLimit(gl, limitMs) {
    log('size        time in milliseconds');
    log('--------------------------------');
    for (let i = 0; i < 32; ++i) {
      const {width, height} = widthHeightFromIndex(i);
      const timeElapsedMs = await getTimeMsForSize(gl, width, height);
      const dims = `${width}x${height}`;
      log(`${dims.padEnd(11)} ${timeElapsedMs.toFixed(1).padStart(6)}`);
      if (timeElapsedMs > limitMs) {
        return widthHeightFromIndex(i - 1);
      }
    }
  }

  (async () => {
    const limit = 1000 / 20;
    const {width, height} = await getSizeThatRunsUnderLimit(gl, limit);
    log('--------------------------------');
    log(`use ${width}x${height}`);
  })();

  async function getTimeMsForSize(gl, width, height) {
    gl.canvas.width = width;
    gl.canvas.height = height;
    gl.viewport(0, 0, width, height);

    // prime the GPU/driver
    // this is voodoo but if I don't do this
    // all the numbers come out bad. Even with
    // this the first test seems to fail with
    // a large number intermittently
    gl.drawElements(gl.TRIANGLES, 6, gl.UNSIGNED_SHORT, 0);

    for (;;) {
      const query = gl.createQuery();
      gl.beginQuery(ext.TIME_ELAPSED_EXT, query);

      gl.drawElements(gl.TRIANGLES, 6, gl.UNSIGNED_SHORT, 0);

      gl.endQuery(ext.TIME_ELAPSED_EXT);
      gl.flush();

      for (;;) {
        await waitFrame();

        const available = gl.getQueryParameter(query, gl.QUERY_RESULT_AVAILABLE);
        if (available) {
          break;
        }
      }

      const disjoint = gl.getParameter(ext.GPU_DISJOINT_EXT);    
      if (!disjoint) {
        const timeElapsed = gl.getQueryParameter(query, gl.QUERY_RESULT); 
        gl.deleteQuery(query);
        return timeElapsed / (10 ** 6);  // return milliseconds
      }

      gl.deleteQuery(query);
    }
  }
}

main();

function log(...args) {
  const elem = document.createElement('pre');
  elem.textContent = args.join(' ');
  document.body.appendChild(elem);
}
pre { margin: 0; }
<script src="https://twgljs.org/dist/4.x/twgl-full.min.js"></script>

On my 2014 MacBook Pro with dual GPUs (Intel/Nvidia), first off, even though I request high-performance, Chrome gives me low-power, meaning it's using the integrated Intel GPU.

The first timing, on 1x1 pixels, often comes out at ~17ms, intermittently but not always. I don't know how to fix that. I could keep re-timing until the 1x1 result is a more reasonable number, say, time up to 5 times until it's < 1ms, and fail if it never gets there? (See the retry sketch after the numbers below.)

powerPreference: low-power

size        time in milliseconds
--------------------------------
1x1           16.1
2x1            0.0
2x2            0.0
4x2            0.0
4x4            0.0
8x4            0.1
8x8            0.1
16x8           0.0
16x16          0.0
32x16          0.0
32x32          0.0
64x32         13.6
64x64         35.7
128x64        62.6
--------------------------------
use 64x64
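
Something like this retry loop is what I mean (an untested sketch; getTimeMsForSize is the function from the snippet above):

async function timeWithRetries(gl, width, height, tries = 5, sanityLimitMs = 1) {
  let best = Infinity;
  for (let i = 0; i < tries; ++i) {
    best = Math.min(best, await getTimeMsForSize(gl, width, height));
    if (best < sanityLimitMs) {
      return best;  // got a believable reading
    }
  }
  return best;  // never got under the sanity limit; caller decides what to do
}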

Testing on a late 2018 MacBook Air with an integrated Intel GPU shows a similar issue, except the first timing comes out even worse, at 42ms.

size        time in milliseconds
--------------------------------
1x1           42.4
2x1            0.0
2x2            0.0
4x2            0.0
4x4            0.0
8x4            0.0
8x8            0.0
16x8           0.0
16x16          0.0
32x16          0.0
32x32          0.0
64x32          0.0
64x64         51.5
--------------------------------
use 64x32

Further, the timings are kind of bogus. Note that on my 2014 MBP, 32x32 is 0ms and 64x32 is suddenly 13.6ms; with half the pixels I'd expect 32x32 to take around half that, ~6.8ms. Same on the MBA above: everything is 0 and then suddenly 51.5ms!?

Running it on a Windows 10 desktop with an Nvidia RTX 2070, everything seems more reasonable: the 1x1 timing is sensible and the timings grow as expected.

powerPreference: low-power

size        time in milliseconds
--------------------------------
1x1            0.0
2x1            0.0
2x2            0.0
4x2            0.0
4x4            0.0
8x4            0.0
8x8            0.0
16x8           0.0
16x16          0.0
32x16          0.1
32x32          0.1
64x32          2.4
64x64          2.9
128x64         3.1
128x128        6.0
256x128       15.4
256x256       27.8
512x256       58.6
--------------------------------
use 256x256

Also, on all systems, if I don't pre-draw each size before timing it, the test fails and all timings come out > 16ms. Adding the pre-draw seems to work, but it's voodoo. I even tried pre-drawing just 1x1 pixels instead of width-by-height pixels as the pre-draw, and that failed!?

Further, Firefox doesn't support EXT_disjoint_timer_query_webgl2. I believe that's because precision timing makes it possible to steal info from other processes; Chrome fixed this with site isolation, but I'm guessing Firefox has yet to do that.

Note: WebGL1 has EXT_disjoint_timer_query for similar functionality.
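
Detecting one or the other might look like this (a sketch; note the WebGL1 extension exposes its own EXT-suffixed entry points, so the query calls in the snippet above would need adjusting):

// WebGL2: queries go through gl.createQuery / gl.beginQuery / gl.getQueryParameter
const ext2 = gl.getExtension('EXT_disjoint_timer_query_webgl2');
// WebGL1: the extension itself provides the entry points, e.g.
// ext1.createQueryEXT / ext1.beginQueryEXT / ext1.getQueryObjectEXT
const ext1 = gl.getExtension('EXT_disjoint_timer_query');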

Update: the issues on Intel GPUs might be related to fuzzing the timing to avoid security issues? Intel GPUs use unified memory (meaning they share memory with the CPU). I don't know; the Chrome security article mentions lowering precision on devices with unified memory.

I suppose even without the timing extensions you could try seeing whether you can render within a 60Hz frame budget by checking requestAnimationFrame timing. Unfortunately my experience there is also that it can be flaky: anything can cause rAF to take longer than 1/60th of a second. Maybe the user is running other apps, maybe they're on a 30Hz monitor, etc. Maybe average the timings over a certain number of frames, or take the lowest reading of multiple timings, as in the sketch below.
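
Something like this lowest-of-N measurement is what I mean (a sketch; drawOnce is a placeholder for whatever work is being timed):

const waitFrame = () => new Promise(resolve => requestAnimationFrame(resolve));

async function estimateDrawMs(drawOnce, samples = 10) {
  let best = Infinity;
  await waitFrame();  // align to a frame boundary first
  for (let i = 0; i < samples; ++i) {
    const start = performance.now();
    drawOnce();
    await waitFrame();  // resolves at the next animation frame
    best = Math.min(best, performance.now() - start);
  }
  return best;  // still noisy: other apps, 30Hz monitors, etc.
}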


2 Comments

Thanks gman. I think I'll use the time from this extension if it's available. I also have an onboard Intel GPU, and my results are all above 10ms. They also look suspiciously close to multiples of 1/60s (16, 33, 50). Maybe it has to do with the browser's drawing rate. But in any case they are more precise than my performance.now() measurements.
I also get "low-power" in the snippet, but "high-performance" when run from a local HTML file.
