-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathkernel_utils.py
More file actions
50 lines (44 loc) · 1.63 KB
/
kernel_utils.py
File metadata and controls
50 lines (44 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import torch
from scipy.stats import gmean
def benchmark(fn, warmup=100, rep=200, quantiles=None, fast_flush=True):
    """Time a CUDA callable with CUDA events, flushing L2 between runs.

    Adapted from:
    https://github.com/nox-410/tvm.tl/blob/tl/python/tvm/tl/utils.py#L144

    Parameters
    ----------
    fn : callable
        Zero-argument callable that launches the CUDA work to be timed.
    warmup : float, default 100
        Target total warm-up time in milliseconds (converted to an
        iteration count from an initial latency estimate).
    rep : float, default 200
        Target total measurement time in milliseconds.
    quantiles : sequence of float or None, default None
        If given, return these quantiles of the per-run times (ms);
        a single requested quantile is returned as a scalar float.
        If None, return the geometric mean of the per-run times (ms).
    fast_flush : bool, default True
        Use a 4-byte-element 256 MB flush buffer (faster to zero) instead
        of a 1-byte-element one.

    Returns
    -------
    float or list[float]
        Per-run time statistic(s) in milliseconds (see ``quantiles``).
    """
    # One untimed call so lazy compilation / autotuning doesn't pollute timing.
    fn()
    torch.cuda.synchronize()
    # 256 MB buffer; zeroing it before each run evicts fn's data from L2
    # so every measured run starts from a cold cache.
    if fast_flush:
        cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
    else:
        cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
    # Rough per-call latency estimate, used to size the warm-up and
    # measurement loops so they hit the requested total durations.
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(5):
        cache.zero_()
        fn()
    end_event.record()
    torch.cuda.synchronize()
    # Clamp: a trivially fast fn can yield 0.0 ms and would otherwise
    # raise ZeroDivisionError below.
    estimate_ms = max(start_event.elapsed_time(end_event) / 5, 1e-6)
    n_warmup = max(1, int(warmup / estimate_ms))
    n_repeat = max(1, int(rep / estimate_ms))
    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(n_repeat)]
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(n_repeat)]
    # Warm-up
    for _ in range(n_warmup):
        fn()
    # Benchmark: flush L2 before each timed run.
    for i in range(n_repeat):
        cache.zero_()
        start_events[i].record()
        fn()
        end_events[i].record()
    torch.cuda.synchronize()
    # elapsed_time is a host-side CUDA call; gather once and reuse the
    # tensor (the original recomputed the full list a second time).
    times = torch.tensor(
        [s.elapsed_time(e) for s, e in zip(start_events, end_events)],
        dtype=torch.float,
    )
    if quantiles is not None:
        ret = torch.quantile(times, torch.tensor(quantiles, dtype=torch.float)).tolist()
        if len(ret) == 1:
            ret = ret[0]
        return ret
    return gmean(times.tolist())