
Commit 2147aad

Merge pull request #395 from Yhg1s/synthesize_loops_script
Add synthesize_loops_file script
2 parents: eeccea6 + 90fd6c1

3 files changed: +353, -0 lines


bench_runner/__main__.py (1 addition, 0 deletions)

@@ -19,6 +19,7 @@
     "remove_benchmark": "Remove specific benchmarks from the data set",
     "run_benchmarks": "Run benchmarks (in timing, pyperf or perf modes)",
     "should_run": "Determine whether we need to rerun results for the current commit",
+    "synthesize_loops_file": "Create a loops file from multiple benchmark results",
     "notify": "Send a notification about the completion of the workflow",
 }

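The entry above only registers the new command's name and help text. As a rough orientation, here is an illustrative sketch of how such a name-to-description table can be wired to the scripts themselves; it assumes each key names a module under bench_runner.scripts that exposes a main() (which the new script below does provide) and is not taken from bench_runner's actual __main__ implementation.

```python
# Illustrative sketch only: dispatching on a name -> description table like
# the one above. Assumes each key names a module under bench_runner.scripts
# exposing main(); this is not bench_runner's actual dispatcher.
import importlib
import sys

COMMANDS = {
    "synthesize_loops_file": "Create a loops file from multiple benchmark results",
    # ... other commands elided ...
}


def dispatch(argv: list[str]) -> None:
    if not argv or argv[0] not in COMMANDS:
        # Unknown or missing command: print the table as a usage summary.
        for name, description in sorted(COMMANDS.items()):
            print(f"{name}: {description}", file=sys.stderr)
        raise SystemExit(2)
    command, *rest = argv
    module = importlib.import_module(f"bench_runner.scripts.{command}")
    sys.argv = [command, *rest]  # leave remaining arguments to the script's own argparse
    module.main()


if __name__ == "__main__":
    dispatch(sys.argv[1:])
```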
bench_runner/scripts/synthesize_loops_file.py (new file: 270 additions, 0 deletions)
import argparse
import collections
import errno
import json
import pathlib
import sys
from typing import Iterable

import rich_argparse

# pyperf/pyperformance run benchmarks by benchmark name, but store results,
# including loops used, by data point name (as reported by the benchmark).
# There's no mapping from the one to the other that we can easily use (other
# than running benchmarks one by one and checking what data points they
# report), so here's a hand-written mapping instead. Benchmarks that use
# their own name for the data point are omitted. This will probably (slowly)
# get out of date, but so be it.
#
# (Sorted by value, then key.)
DATAPOINT_TO_BENCHMARK = {
    "many_optionals": "argparse",
    "subparsers": "argparse",
    "async_tree_none": "async_tree",
    "async_tree_none_tg": "async_tree_tg",
    "bench_mp_pool": "concurrent_imap",
    "bench_thread_pool": "concurrent_imap",
    "deepcopy_memo": "deepcopy",
    "deepcopy_reduce": "deepcopy",
    "create_gc_cycles": "gc_collect",
    "genshi_text": "genshi",
    "genshi_xml": "genshi",
    "logging_format": "logging",
    "logging_silent": "logging",
    "logging_simple": "logging",
    "shortest_path": "networkx",
    "connected_components": "networkx_connected_components",
    "k_core": "networkx_k_core",
    "pprint_pformat": "pprint",
    "pprint_safe_repr": "pprint",
    "scimark_fft": "scimark",
    "scimark_lu": "scimark",
    "scimark_monte_carlo": "scimark",
    "scimark_sor": "scimark",
    "scimark_sparse_mat_mult": "scimark",
    "sqlglot_v2_normalize": "sqlglot_v2",
    "sympy_expand": "sympy",
    "sympy_integrate": "sympy",
    "sympy_str": "sympy",
    "sympy_sum": "sympy",
    "xml_etree_generate": "xml_etree",
    "xml_etree_iterparse": "xml_etree",
    "xml_etree_parse": "xml_etree",
    "xml_etree_process": "xml_etree",
}

# The list of bm_* directories in pyperformance and pyston-benchmarks, plus
# the aliases defined in their MANIFEST files (entries with
# '<local:$dirname>')
KNOWN_BENCHMARKS = {
    "2to3",
    "aiohttp",
    "argparse",
    "argparse_subparsers",
    "async_generators",
    "async_tree",
    "async_tree_cpu_io_mixed",
    "async_tree_cpu_io_mixed_tg",
    "async_tree_eager",
    "async_tree_eager_cpu_io_mixed",
    "async_tree_eager_cpu_io_mixed_tg",
    "async_tree_eager_io",
    "async_tree_eager_io_tg",
    "async_tree_eager_memoization",
    "async_tree_eager_memoization_tg",
    "async_tree_eager_tg",
    "async_tree_io",
    "async_tree_io_tg",
    "async_tree_memoization",
    "async_tree_memoization_tg",
    "async_tree_tg",
    "asyncio_tcp",
    "asyncio_tcp_ssl",
    "asyncio_websockets",
    "bpe_tokeniser",
    "chameleon",
    "chaos",
    "comprehensions",
    "concurrent_imap",
    "coroutines",
    "coverage",
    "crypto_pyaes",
    "dask",
    "decimal_factorial",
    "decimal_pi",
    "deepcopy",
    "deltablue",
    "django_template",
    "djangocms",
    "docutils",
    "dulwich_log",
    "fannkuch",
    "flaskblogging",
    "float",
    "gc_collect",
    "gc_traversal",
    "generators",
    "genshi",
    "gevent_hub",
    "go",
    "gunicorn",
    "hexiom",
    "hg_startup",
    "html5lib",
    "json",
    "json_dumps",
    "json_loads",
    "kinto",
    "logging",
    "mako",
    "mdp",
    "meteor_contest",
    "mypy2",
    "nbody",
    "networkx",
    "networkx_connected_components",
    "networkx_k_core",
    "nqueens",
    "pathlib",
    "pickle",
    "pickle_dict",
    "pickle_list",
    "pickle_pure_python",
    "pidigits",
    "pprint",
    "pycparser",
    "pyflate",
    "pylint",
    "python_startup",
    "python_startup_no_site",
    "pytorch_alexnet_inference",
    "raytrace",
    "regex_compile",
    "regex_dna",
    "regex_effbot",
    "regex_v8",
    "richards",
    "richards_super",
    "scimark",
    "spectral_norm",
    "sphinx",
    "sqlalchemy_declarative",
    "sqlalchemy_imperative",
    "sqlglot_v2",
    "sqlglot_v2_optimize",
    "sqlglot_v2_parse",
    "sqlglot_v2_transpile",
    "sqlite_synth",
    "sympy",
    "telco",
    "thrift",
    "tomli_loads",
    "tornado_http",
    "typing_runtime_protocols",
    "unpack_sequence",
    "unpickle",
    "unpickle_list",
    "unpickle_pure_python",
    "xml_etree",
}


def parse_result(results_file, benchmark_data):
    with results_file.open() as f:
        result = json.load(f)
    bms = result["benchmarks"]
    if len(bms) == 1 and "metadata" not in bms[0]:
        # Sometimes a .json file contains just a single benchmark.
        bms = [result]
    for bm in bms:
        if "metadata" not in bm:
            raise RuntimeError(f"Invalid data {bm.keys()!r} in {results_file}")
        name = bm["metadata"]["name"]
        name = DATAPOINT_TO_BENCHMARK.get(name, name)
        assert name is not None  # to satisfy pyright.
        if name not in KNOWN_BENCHMARKS:
            print(
                f"WARNING: unknown benchmark {name!r} in {results_file}",
                file=sys.stderr,
            )
            # Avoid repeated warnings.
            KNOWN_BENCHMARKS.add(name)
        benchmark_data[name].append(bm["metadata"]["loops"])


def _main(
    loops_file: pathlib.Path,
    update: bool,
    overwrite: bool,
    merger: str,
    results: Iterable[pathlib.Path],
):
    if not update and not overwrite and loops_file.exists():
        raise OSError(
            errno.EEXIST,
            f"{loops_file} exists (use -f to overwrite, -u to merge data)",
        )
    benchmark_data = collections.defaultdict(list)
    if update:
        parse_result(loops_file, benchmark_data)
    for result_file in results:
        parse_result(result_file, benchmark_data)

    merge_func = {
        "max": max,
        "min": min,
    }[merger]

    # pyperformance expects a specific layout, and needs the top-level
    # metadata even if it's empty.
    loops_data = {"benchmarks": [], "metadata": {}}
    for bm in sorted(benchmark_data):
        loops = merge_func(benchmark_data[bm])
        bm_result = {"metadata": {"name": bm, "loops": loops}}
        loops_data["benchmarks"].append(bm_result)
    with loops_file.open("w") as f:
        json.dump(loops_data, f, sort_keys=True, indent=4)
        f.write("\n")


def main():
    parser = argparse.ArgumentParser(
        description="""
        Synthesize a loops.json file for use with `pyperformance`'s
        `--same-loops` (or `PYPERFORMANCE_LOOPS_FILE`) from one or more
        benchmark results.
        """,
        formatter_class=rich_argparse.ArgumentDefaultsRichHelpFormatter,
    )
    parser.add_argument(
        "-o", "--loops_file", help="loops file to write to", required=True
    )
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument(
        "-u", "--update", action="store_true", help="add to existing loops file"
    )
    group.add_argument(
        "-f", "--overwrite", action="store_true", help="replace loops file"
    )
    parser.add_argument(
        "-s",
        "--select",
        choices=("max", "min"),
        default="max",
        help="how to merge multiple runs",
    )
    parser.add_argument("results", nargs="+", help="benchmark results to parse")
    args = parser.parse_args()

    _main(
        pathlib.Path(args.loops_file),
        args.update,
        args.overwrite,
        args.select,
        [pathlib.Path(r) for r in args.results],
    )


if __name__ == "__main__":
    main()

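As a usage sketch, the call below mirrors the keyword arguments exercised by the test further down, with hypothetical result paths; the final step hands the generated file to pyperformance through the `--same-loops` option named in the script's help text (the exact pyperformance invocation is an assumption of this sketch, not part of the change).

```python
# Illustrative usage sketch (hypothetical paths; mirrors the test's call style).
import pathlib
import subprocess

from bench_runner.scripts import synthesize_loops_file

results = sorted(pathlib.Path("results").glob("**/*.json"))  # hypothetical result files
loops_file = pathlib.Path("loops.json")

# Merge the loop counts from all results, keeping the maximum per benchmark.
synthesize_loops_file._main(
    loops_file=loops_file,
    update=False,
    overwrite=True,
    merger="max",
    results=results,
)

# The generated file has the layout pyperformance expects, e.g.:
# {"metadata": {}, "benchmarks": [{"metadata": {"name": "2to3", "loops": 16}}, ...]}
# It can then be used so every run repeats the same loop counts, per the
# script's description (`--same-loops` / PYPERFORMANCE_LOOPS_FILE). The exact
# command line below is an assumption, not taken from this repository.
subprocess.run(
    ["pyperformance", "run", "--same-loops", str(loops_file)],
    check=True,
)
```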
tests/test_synthesize_loops_file.py (new file: 82 additions, 0 deletions)

import json
import pathlib
import sys
import tempfile

import pytest

from bench_runner.scripts import synthesize_loops_file

DATA_PATH = pathlib.Path(__file__).parent / "data"


def run_synthesize(
    output: pathlib.Path,
    datadir: pathlib.Path,
    *,
    update: bool = False,
    overwrite: bool = False,
    merger: str = "max",
):
    files = datadir.glob("results/**/*.json")
    synthesize_loops_file._main(
        loops_file=output,
        update=update,
        overwrite=overwrite,
        merger=merger,
        results=files,
    )


def check_loops(output: pathlib.Path):
    with output.open() as f:
        data = json.load(f)
    assert "benchmarks" in data
    assert "metadata" in data
    seen = set()
    for bm in data["benchmarks"]:
        assert "metadata" in bm
        assert "loops" in bm["metadata"]
        assert isinstance(bm["metadata"]["loops"], int)
        assert "name" in bm["metadata"]
        assert (name := bm["metadata"]["name"]) not in seen
        assert isinstance(name, str)
        seen.add(name)
    data["benchmarks"].sort(key=lambda item: item["metadata"]["name"])
    return data


def set_loops(output, value):
    with output.open() as f:
        data = json.load(f)
    for bm in data["benchmarks"]:
        bm["metadata"]["loops"] = value
    with output.open("w") as f:
        json.dump(data, f, sort_keys=True, indent=4)


def test_synthesize():
    with tempfile.TemporaryDirectory() as tmpdir:
        output = pathlib.Path(tmpdir) / "loops.json"
        run_synthesize(output, DATA_PATH)
        expected_data = check_loops(output)

        with pytest.raises(FileExistsError):
            run_synthesize(output, DATA_PATH)

        run_synthesize(output, DATA_PATH, update=True)
        assert expected_data == check_loops(output)

        set_loops(output, 0)
        run_synthesize(output, DATA_PATH, update=True)
        assert expected_data == check_loops(output)

        set_loops(output, sys.maxsize)
        run_synthesize(output, DATA_PATH, overwrite=True)
        assert expected_data == check_loops(output)

        run_synthesize(output, DATA_PATH, overwrite=True, merger="min")
        expected_data = check_loops(output)
        set_loops(output, sys.maxsize)
        run_synthesize(output, DATA_PATH, update=True, merger="min")
        assert expected_data == check_loops(output)

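The benchmark results under tests/data are not part of this diff; the snippet below is a hand-built illustration of the minimal input shape parse_result() accepts, inferred from the code above, with made-up loop counts.

```python
# A minimal, hand-built example of the input shape parse_result() accepts,
# inferred from the code above; real pyperf result files carry much more data,
# and the loop counts here are hypothetical.
import collections
import json
import pathlib
import tempfile

from bench_runner.scripts.synthesize_loops_file import parse_result

fake_result = {
    "benchmarks": [
        {"metadata": {"name": "deepcopy_memo", "loops": 32}},    # maps to "deepcopy"
        {"metadata": {"name": "deepcopy_reduce", "loops": 64}},  # maps to "deepcopy"
        {"metadata": {"name": "nbody", "loops": 16}},            # uses its own name
    ],
    "metadata": {},
}

with tempfile.TemporaryDirectory() as tmpdir:
    path = pathlib.Path(tmpdir) / "example.json"
    path.write_text(json.dumps(fake_result))

    benchmark_data = collections.defaultdict(list)
    parse_result(path, benchmark_data)
    # Data points are folded back onto benchmark names via DATAPOINT_TO_BENCHMARK:
    assert benchmark_data == {"deepcopy": [32, 64], "nbody": [16]}
```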