Skip to content
31 changes: 31 additions & 0 deletions bigframes/session/_io/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import datetime
import itertools
import os
import textwrap
import types
from typing import Dict, Iterable, Optional, Sequence, Tuple, Union
Expand All @@ -34,6 +35,8 @@
MAX_LABELS_COUNT = 64
TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}"

LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"


def create_job_configs_labels(
job_configs_labels: Optional[Dict[str, str]],
Expand Down Expand Up @@ -243,4 +246,32 @@ def start_query_with_client(
)
else:
results_iterator = query_job.result(max_results=max_results)

if LOGGING_NAME_ENV_VAR in os.environ:
# when running notebooks via pytest nbmake
pytest_log_job(query_job)

return results_iterator, query_job


def pytest_log_job(query_job: bigquery.QueryJob):
    """Record a query job's processed bytes for the pytest performance report.

    Appends one line holding the job's ``total_bytes_processed`` to the file
    ``<name>.bytesprocessed`` in the current working directory, where
    ``<name>`` is the value of the environment variable named by
    ``LOGGING_NAME_ENV_VAR``. Dry-run jobs are recorded as ``0``. Jobs whose
    byte count is not an ``int`` (e.g. mocks) are skipped without writing.

    Args:
        query_job: The query job whose usage should be recorded.

    Raises:
        EnvironmentError: If the environment variable named by
            ``LOGGING_NAME_ENV_VAR`` is not set.
    """
    report_name = os.environ.get(LOGGING_NAME_ENV_VAR)
    if report_name is None:
        raise EnvironmentError(
            "Environment variable {env_var} is not set".format(
                env_var=LOGGING_NAME_ENV_VAR
            )
        )

    report_dir = os.getcwd()
    processed = query_job.total_bytes_processed
    if not isinstance(processed, int):
        # Mocked jobs do not expose an integer byte count; nothing to log.
        return

    # Dry runs don't actually process their total_bytes_processed, so they
    # are logged as zero bytes.
    logged_bytes = 0 if query_job.configuration.dry_run else processed

    report_path = os.path.join(report_dir, report_name + ".bytesprocessed")
    with open(report_path, "a") as report_file:
        report_file.write(f"{logged_bytes!s}\n")
58 changes: 49 additions & 9 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,8 @@ def notebook(session: nox.Session):
"--nbmake-timeout=900", # 15 minutes
]

logging_name_env_var = "BIGFRAMES_PERFORMANCE_LOG_NAME"

try:
# Populate notebook parameters and make a backup so that the notebooks
# are runnable.
Expand All @@ -773,13 +775,21 @@ def notebook(session: nox.Session):
*notebooks,
)

# Run self-contained notebooks in single session.run
# achieve parallelization via -n
session.run(
*pytest_command,
"-nauto",
*notebooks,
)
# Run notebooks in parallel session.run's, since each notebook
# takes an environment variable for performance logging
processes = []
for notebook in notebooks:
session.env[logging_name_env_var] = os.path.basename(notebook)
process = Process(
target=session.run,
args=(*pytest_command, notebook),
)
process.start()
processes.append(process)

for process in processes:
process.join()

finally:
# Prevent our notebook changes from getting checked in to git
# accidentally.
Expand All @@ -789,11 +799,12 @@ def notebook(session: nox.Session):
*notebooks,
)

# Run regionalized notebooks in parallel session.run's, since each notebook
# takes a different region via env param.
# Additionally run regionalized notebooks in parallel session.run's.
# Each notebook takes a different region via env param.
processes = []
for notebook, regions in notebooks_reg.items():
for region in regions:
session.env[logging_name_env_var] = os.path.basename(notebook)
process = Process(
target=session.run,
args=(*pytest_command, notebook),
Expand All @@ -805,6 +816,35 @@ def notebook(session: nox.Session):
for process in processes:
process.join()

# when run via pytest, notebooks output a .bytesprocessed report
# collect those reports and print a summary
_print_bytes_processed_report()


def _print_bytes_processed_report():
    """Print a summary of BigQuery usage collected during notebook runs.

    Reads every ``notebooks/*/*.bytesprocessed`` file (relative to the
    current working directory), treating each non-blank line as the byte
    count of one query. Prints one line per report with its query count and
    byte total, followed by the cumulative totals, so that bigquery-related
    performance changes are visible in the test log.

    Reports are listed in sorted path order so the output is stable between
    runs. Report files are left in place after being read.

    Raises:
        ValueError: If a non-blank line in a report is not an integer.
    """
    print("---BIGQUERY USAGE REPORT---")
    cumulative_queries = 0
    cumulative_bytes = 0
    # glob() does not promise any particular order; sort for a stable log.
    for report in sorted(Path("notebooks/").glob("*/*.bytesprocessed")):
        # Skip blank lines so stray whitespace is neither counted as a query
        # nor passed to int().
        byte_counts = [
            int(line) for line in report.read_text().splitlines() if line.strip()
        ]
        query_count = len(byte_counts)
        total_bytes = sum(byte_counts)
        print(
            f"{report.stem} - query count: {query_count}, bytes processed sum: {total_bytes}"
        )
        cumulative_bytes += total_bytes
        cumulative_queries += query_count
    print(
        f"---total queries: {cumulative_queries}, total bytes: {cumulative_bytes}---"
    )


@nox.session(python="3.10")
def release_dry_run(session):
Expand Down