Skip to content

Commit e5ce80a

Browse files
ARROW-4313: Add ingestion function 'ingest_benchmark_runs_with_context' and example
1 parent 72132cc commit e5ce80a

6 files changed

Lines changed: 409 additions & 25 deletions

File tree

dev/benchmarking/README.md

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ the following in the psql command-line interface:
127127
An example CSV file for bulk ingestion is in
128128
[examples/benchmark_run_example.csv](examples/benchmark_run_example.csv).
129129
The columns are listed in the same order as they are defined, to avoid having
130-
to explicitly name every colum in ingestion. The "id" column is left empty
130+
to explicitly name every column in ingestion. The "id" column is left empty
131131
and will be automatically assigned on insert.
132132

133133
To ingest the example CSV file from the command line,
@@ -225,7 +225,10 @@ for row in response.json()['data']['allEnvironmentViews']['edges']:
225225
- String variables `'have single quotes'`
226226
- Arrays `'{"have", "curly", "braces"}'::text[]` or `'{1, 2, 3}'::integer[]`
227227
- JSONb `'{"has":"this", "format":42}'::jsonb`
228-
- When comparing nullable values use `x IS DISTINCT FROM y` rather than `x = y`
228+
- Elements inserted using JSON-formatted strings can use standard
229+
JSON-formatted arrays (`[1, 2, 3]`) and do not have to use the above
230+
string formats.
231+
- When comparing nullable values use `x IS NOT DISTINCT FROM y` rather than `x = y`
229232
- An auto-generated summary of the [Data model][./data_model.rst].
230233

231234
## Data model documentation
@@ -234,15 +237,13 @@ To recreate the data model documentation,
234237
(1) install the [psql client][postgres-downloads]
235238
(sorry you need to download the whole thing),
236239
(2) start the docker container using `docker-compose up`,
237-
(3) and then type:
240+
(3) and then run these scripts:
238241

239242
```
243+
./make_dotfile.sh
240244
./make_data_model_rst.sh
241245
```
242246

243-
It will ask you for the password (it's whatever is in the ".env" file
244-
as `PG_PASS`, so "benchmark" by default), and then the document should be generated.
245-
246247
[pg-jsonb]: https://www.postgresql.org/docs/11/datatype-json.html#id-1.5.7.22.3
247248
[so-utf8]: https://stackoverflow.com/a/33557023
248249
[pg-charset]: https://www.postgresql.org/docs/9.3/multibyte.html#AEN34424
Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
/*
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
*/
19+
20+
21+
-------------------------- IMPORT HELPERS --------------------------
22+
-- Load from JSON (from https://stackoverflow.com/a/48396608)
23+
-- How to use it in the psql client:
24+
-- \set content `cat /examples/machine.json`
25+
-- select ingest_machine(:'content'::jsonb);
26+
-- INGEST_MACHINE_VIEW
27+
CREATE OR REPLACE FUNCTION public.ingest_machine_view(from_jsonb jsonb)
28+
RETURNS integer AS
29+
$$
30+
DECLARE
31+
result integer;
32+
BEGIN
33+
INSERT INTO public.machine_view
34+
SELECT * FROM jsonb_populate_record(null::public.machine_view, from_jsonb)
35+
RETURNING machine_id INTO result;
36+
RETURN result;
37+
END
38+
$$
39+
LANGUAGE plpgsql;
40+
COMMENT ON FUNCTION public.ingest_machine_view(jsonb) IS
41+
E'The argument is a JSON object. NOTE: key names must be entirely\n'
42+
'lowercase, or the insert will fail. Extra key-value pairs are ignored.\n'
43+
'Example::\n\n'
44+
' {\n'
45+
' "mac_address": "0a:00:2d:01:02:03",\n'
46+
' "machine_name": "Yet-Another-Machine-Name",\n'
47+
' "memory_bytes": 8589934592,\n'
48+
' "cpu_actual_frequency_hz": 2300000000,\n'
49+
' "os_name": "OSX",\n'
50+
' "architecture_name": "x86_64",\n'
51+
' "kernel_name": "18.2.0",\n'
52+
' "cpu_model_name": "Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz",\n'
53+
' "cpu_core_count": 2,\n'
54+
' "cpu_thread_count": 4,\n'
55+
' "cpu_frequency_max_hz": 2300000000,\n'
56+
' "cpu_frequency_min_hz": 2300000000,\n'
57+
' "cpu_l1d_cache_bytes": 32768,\n'
58+
' "cpu_l1i_cache_bytes": 32768,\n'
59+
' "cpu_l2_cache_bytes": 262144,\n'
60+
' "cpu_l3_cache_bytes": 4194304,\n'
61+
' "machine_other_attributes": {"just": "an example"},\n'
62+
' "gpu_information": "",\n'
63+
' "gpu_part_number": "",\n'
64+
' "gpu_product_name": ""\n'
65+
' }\n\n'
66+
'To identify which columns in "machine_view" are required,\n'
67+
'please see the view documentation in :ref:`benchmark-data-model`.\n';
68+
69+
-- INGEST_BENCHMARK_VIEW
70+
CREATE OR REPLACE FUNCTION public.ingest_benchmark_view(from_jsonb jsonb)
71+
RETURNS setof integer AS
72+
$$
73+
BEGIN
74+
RETURN QUERY
75+
INSERT INTO public.benchmark_view
76+
SELECT * FROM jsonb_populate_recordset(
77+
null::public.benchmark_view
78+
, from_jsonb
79+
)
80+
RETURNING benchmark_id;
81+
END
82+
$$
83+
LANGUAGE plpgsql;
84+
COMMENT ON FUNCTION public.ingest_benchmark_view(jsonb) IS
85+
E'The argument is a JSON object. NOTE: key names must be entirely\n'
86+
'lowercase, or the insert will fail. Extra key-value pairs are ignored.\n'
87+
'Example::\n\n'
88+
' [\n'
89+
' {\n'
90+
' "benchmark_name": "Benchmark 1",\n'
91+
' "parameter_names": ["arg0", "arg1", "arg2"],\n'
92+
' "benchmark_description": "First benchmark",\n'
93+
' "benchmark_type": "Time",\n'
94+
' "units": "miliseconds",\n'
95+
' "lessisbetter": true,\n'
96+
' "benchmark_version": "second version",\n'
97+
' "benchmark_language": "Python"\n'
98+
' },\n'
99+
' {\n'
100+
' "benchmark_name": "Benchmark 2",\n'
101+
' "parameter_names": ["arg0", "arg1"],\n'
102+
' "benchmark_description": "Description 2.",\n'
103+
' "benchmark_type": "Time",\n'
104+
' "units": "nanoseconds",\n'
105+
' "lessisbetter": true,\n'
106+
' "benchmark_version": "second version",\n'
107+
' "benchmark_language": "Python"\n'
108+
' }\n'
109+
' ]\n\n'
110+
'To identify which columns in "benchmark_view" are required,\n'
111+
'please see the view documentation in :ref:`benchmark-data-model`.\n';
112+
113+
-- INGEST_BENCHMARK_RUN_VIEW
114+
CREATE OR REPLACE FUNCTION public.ingest_benchmark_run_view(from_jsonb jsonb)
115+
RETURNS setof bigint AS
116+
$$
117+
BEGIN
118+
RETURN QUERY
119+
INSERT INTO public.benchmark_run_view
120+
SELECT * FROM
121+
jsonb_populate_recordset(null::public.benchmark_run_view, from_jsonb)
122+
RETURNING benchmark_run_id;
123+
END
124+
$$
125+
LANGUAGE plpgsql;
126+
COMMENT ON FUNCTION public.ingest_benchmark_run_view(jsonb) IS
127+
E'The argument is a JSON object. NOTE: key names must be entirely\n'
128+
'lowercase, or the insert will fail. Extra key-value pairs are ignored.\n'
129+
'Example::\n\n'
130+
' [\n'
131+
' {\n'
132+
' "benchmark_name": "Benchmark 2",\n'
133+
' "benchmark_version": "version 0",\n'
134+
' "parameter_values": {"arg0": 100, "arg1": 5},\n'
135+
' "value": 2.5,\n'
136+
' "git_commit_timestamp": "2019-02-08 22:35:53 +0100",\n'
137+
' "git_hash": "324d3cf198444a",\n'
138+
' "val_min": 1,\n'
139+
' "val_q1": 2,\n'
140+
' "val_q3": 3,\n'
141+
' "val_max": 4,\n'
142+
' "std_dev": 1.41,\n'
143+
' "n_obs": 8,\n'
144+
' "run_timestamp": "2019-02-14 03:00:05 -0600",\n'
145+
' "mac_address": "08:00:2b:01:02:03",\n'
146+
' "benchmark_language": "Python",\n'
147+
' "language_implementation_version": "CPython 2.7",\n'
148+
' "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"}\n'
149+
' },\n'
150+
' {\n'
151+
' "benchmark_name": "Benchmark 2",\n'
152+
' "benchmark_version": "version 0",\n'
153+
' "parameter_values": {"arg0": 1000, "arg1": 5},\n'
154+
' "value": 5,\n'
155+
' "git_commit_timestamp": "2019-02-08 22:35:53 +0100",\n'
156+
' "git_hash": "324d3cf198444a",\n'
157+
' "std_dev": 3.14,\n'
158+
' "n_obs": 8,\n'
159+
' "run_timestamp": "2019-02-14 03:00:10 -0600",\n'
160+
' "mac_address": "08:00:2b:01:02:03",\n'
161+
' "benchmark_language": "Python",\n'
162+
' "language_implementation_version": "CPython 2.7",\n'
163+
' "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"}\n'
164+
' }\n'
165+
' ]\n'
166+
'To identify which columns in "benchmark_run_view" are required,\n'
167+
'please see the view documentation in :ref:`benchmark-data-model`.\n';
168+
169+
-- INGEST_BENCHMARK_RUNS_WITH_CONTEXT
170+
CREATE OR REPLACE FUNCTION public.ingest_benchmark_runs_with_context(from_jsonb jsonb)
171+
RETURNS setof bigint AS
172+
$$
173+
DECLARE
174+
context_jsonb jsonb;
175+
found_environment_id integer;
176+
found_machine_id integer;
177+
BEGIN
178+
SELECT from_jsonb -> 'context' INTO context_jsonb;
179+
180+
SELECT public.get_machine_id((context_jsonb ->> 'mac_address')::macaddr)
181+
INTO found_machine_id;
182+
183+
SELECT get_environment_id(
184+
(context_jsonb ->> 'benchmark_language')::citext
185+
, (context_jsonb ->> 'language_implementation_version')::citext
186+
, context_jsonb -> 'dependencies'
187+
) INTO found_environment_id;
188+
189+
RETURN QUERY
190+
WITH run_datum AS (
191+
SELECT *
192+
FROM jsonb_to_recordset(from_jsonb -> 'benchmarks')
193+
AS x(
194+
benchmark_name citext
195+
, parameter_values jsonb
196+
, value numeric
197+
, val_min numeric
198+
, val_q1 numeric
199+
, val_q3 numeric
200+
, val_max numeric
201+
, std_dev numeric
202+
, n_obs integer
203+
, run_timestamp timestamp (0) with time zone
204+
, run_metadata jsonb
205+
, run_notes text
206+
)
207+
), benchmark_name_and_id AS (
208+
SELECT
209+
key AS benchmark_name
210+
, public.get_benchmark_id(
211+
(context_jsonb ->> 'benchmark_language')::citext
212+
, key::citext -- benchmark_name
213+
, value::citext -- benchmark_version
214+
) AS benchmark_id
215+
FROM jsonb_each_text(from_jsonb -> 'benchmark_version')
216+
)
217+
INSERT INTO public.benchmark_run (
218+
benchmark_id
219+
-- run_datum
220+
, parameter_values
221+
, value
222+
, val_min
223+
, val_q1
224+
, val_q3
225+
, val_max
226+
, std_dev
227+
, n_obs
228+
, run_metadata
229+
, run_notes
230+
-- additional context information
231+
, git_commit_timestamp
232+
, git_hash
233+
, run_timestamp
234+
-- machine
235+
, machine_id
236+
-- environment
237+
, environment_id
238+
, language_implementation_version_id
239+
, benchmark_language_id
240+
)
241+
SELECT
242+
b.benchmark_id
243+
-- run_datum
244+
, run_datum.parameter_values
245+
, run_datum.value
246+
, run_datum.val_min
247+
, run_datum.val_q1
248+
, run_datum.val_q3
249+
, run_datum.val_max
250+
, run_datum.std_dev
251+
, run_datum.n_obs
252+
, run_datum.run_metadata
253+
, run_datum.run_notes
254+
-- additional context information
255+
, (context_jsonb ->> 'git_commit_timestamp')::timestamp (0) with time zone
256+
, context_jsonb ->> 'git_hash'
257+
, (context_jsonb ->> 'run_timestamp')::timestamp (0) with time zone
258+
-- machine
259+
, found_machine_id
260+
-- environment
261+
, e.environment_id
262+
, e.language_implementation_version_id
263+
, e.benchmark_language_id
264+
FROM run_datum
265+
JOIN public.environment AS e
266+
ON e.environment_id = found_environment_id
267+
JOIN benchmark_name_and_id AS b
268+
ON b.benchmark_name = run_datum.benchmark_name
269+
RETURNING benchmark_run_id;
270+
END
271+
$$
272+
LANGUAGE plpgsql;
273+
COMMENT ON FUNCTION public.ingest_benchmark_runs_with_context(jsonb) IS
274+
E'The argument is a JSON object. NOTE: key names must be entirely\n'
275+
'lowercase, or the insert will fail. Extra key-value pairs are ignored.\n'
276+
'The object contains three key-value pairs::\n\n'
277+
' {"context": {\n'
278+
' "mac_address": "08:00:2b:01:02:03",\n'
279+
' "benchmark_language": "Python",\n'
280+
' "language_implementation_version": "CPython 3.6",\n'
281+
' "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"},\n'
282+
' "git_commit_timestamp": "2019-02-14 22:42:22 +0100",\n'
283+
' "git_hash": "123456789abcde",\n'
284+
' "run_timestamp": "2019-02-14 03:00:40 -0600",\n'
285+
' "extra stuff": "does not hurt anything and will not be added."\n'
286+
' },\n'
287+
' "benchmark_version": {\n'
288+
' "Benchmark Name 1": "Any string can be a version.",\n'
289+
' "Benchmark Name 2": "A git hash can be a version.",\n'
290+
' "An Unused Benchmark Name": "Will be ignored."\n'
291+
' },\n'
292+
' "benchmarks": [\n'
293+
' {\n'
294+
' "benchmark_name": "Benchmark Name 1",\n'
295+
' "parameter_values": {"argument1": 1, "argument2": "value2"},\n'
296+
' "value": 42,\n'
297+
' "val_min": 41.2,\n'
298+
' "val_q1": 41.5,\n'
299+
' "val_q3": 42.5,\n'
300+
' "val_max": 42.8,\n'
301+
' "std_dev": 0.5,\n'
302+
' "n_obs": 100,\n'
303+
' "run_metadata": {"any": "key-value pairs"},\n'
304+
' "run_notes": "Any relevant notes."\n'
305+
' },\n'
306+
' {\n'
307+
' "benchmark_name": "Benchmark Name 2",\n'
308+
' "parameter_values": {"not nullable": "Use {} if no params."},\n'
309+
' "value": 8,\n'
310+
' "std_dev": 1,\n'
311+
' "n_obs": 2,\n'
312+
' }\n'
313+
' ]\n'
314+
' }\n\n'
315+
'- The entry for "context" contains the machine, environment, and timestamp\n'
316+
' information common to all of the runs\n'
317+
'- The entry for "benchmark_version" maps benchmark\n'
318+
' names to their version strings. (Which can be a git hash,\n'
319+
' the entire code string, a number, or any other string of your choice.)\n'
320+
'- The entry for "benchmarks" is a list of benchmark run data\n'
321+
' for the given context and benchmark versions. The first example\n'
322+
' benchmark run entry contains all possible values, even\n'
323+
' nullable ones, and the second entry omits all nullable values.\n\n';

dev/benchmarking/examples/benchmark_result_example_data.csv

Lines changed: 0 additions & 6 deletions
This file was deleted.

0 commit comments

Comments
 (0)