-
-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathcachegrind.py
executable file
·139 lines (102 loc) · 4.4 KB
/
cachegrind.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3
# Based on https://github.com/pythonspeed/cachegrind-benchmarking/blob/main/cachegrind.py
"""
Run a program under Cachegrind, combining various metrics into one single performance metric.
License: https://opensource.org/licenses/MIT
## Features
* Disables ASLR.
* Sets consistent cache sizes.
* Calculates a combined performance metric.
For more information see the detailed write up at:
https://pythonspeed.com/articles/consistent-benchmarking-in-ci/
## Usage
$ python3 cachegrind.py ./yourprogram --yourparam=yourvalues
If you're benchmarking Python, make sure to set PYTHONHASHSEED to a fixed value
(e.g. `export PYTHONHASHSEED=1234`). Other languages may have similar
requirements to reduce variability.
The last line printed will be a combined performance metric, but you can tweak
the script to extract more info, or use it as a library.
Copyright © 2020, Hyphenated Enterprises LLC.
"""
from __future__ import annotations
import sys
import typing as t
from subprocess import DEVNULL
from subprocess import check_call
from subprocess import check_output
from subprocess import run
from tempfile import NamedTemporaryFile
# Probe for the external tools at import time so a missing binary produces a
# short, readable exit message instead of a traceback later on.
try:
    check_call(['setarch', '-h'], stdout=DEVNULL, stderr=DEVNULL)
    check_call(['valgrind', '-h'], stdout=DEVNULL, stderr=DEVNULL)
except FileNotFoundError as exc:  # e.g. macOS
    # `from None` hides the original traceback; only the message is shown.
    raise SystemExit(f'Command not found: {exc.filename}') from None

# Machine hardware name as printed by `uname -m`; handed to setarch below.
ARCH = check_output(['uname', '-m'], text=True).strip()
# Command prefix placed in front of the valgrind invocation; per the module
# docstring its purpose is to disable ASLR for the benchmarked program.
DISABLE_ASLR_CMD = ['setarch', ARCH, '-R']
def run_with_cachegrind(args_list: list[str]) -> dict[str, int]:
    """
    Run the given program and arguments under Cachegrind, then parse the
    Cachegrind output file it produced.

    ``args_list`` is the command to benchmark: the program followed by its
    arguments. The command is prefixed with DISABLE_ASLR_CMD and run with
    fixed simulated cache sizes so results are consistent across runs.

    Returns the mapping produced by parse_cachegrind_output(): Cachegrind
    event name to its summary count.

    For now we just ignore program output, and in general this is not robust.
    The exit status of the program is ignored as well.
    """
    # Use the temporary file as a context manager so it is closed (and
    # removed) deterministically, even if running or parsing raises, rather
    # than whenever the garbage collector gets to it.
    with NamedTemporaryFile('r+') as temp_file:
        run([
            *DISABLE_ASLR_CMD,
            'valgrind',
            '--tool=cachegrind',
            # Set some reasonable L1 and LL values, based on Haswell.
            # Feel free to update, important part is that they are consistent across runs,
            # instead of the default of copying from the current machine.
            '--I1=32768,8,64',
            '--D1=32768,8,64',
            '--LL=8388608,16,64',
            '--cachegrind-out-file=' + temp_file.name,
            *args_list,
        ])  # Don't fail if the program fails (to support e.g. `pytest --benchmark-compare-fail=...`)
        # Parse while the file is still open; it disappears on close.
        return parse_cachegrind_output(temp_file)
def parse_cachegrind_output(temp_file: t.IO[str]) -> dict[str, int]:
    """
    Extract the event totals from a Cachegrind output file.

    Iterates over the lines of ``temp_file`` looking for the ``events: `` line
    (the counter names) and the ``summary: `` line (the totals). If either
    line occurs more than once, the last occurrence wins.

    Returns a dict mapping each event name to its integer total. Names and
    totals are paired positionally; surplus fields on either side are dropped.

    Raises AssertionError if no non-empty ``events: `` or ``summary: `` line
    is found, and ValueError if a summary field is not an integer.
    """
    events_prefix = 'events: '
    summary_prefix = 'summary: '

    header = summary = ''
    for line in temp_file:
        if line.startswith(events_prefix):
            header = line[len(events_prefix):].strip()
        elif line.startswith(summary_prefix):
            summary = line[len(summary_prefix):].strip()

    # Raise explicitly instead of using `assert` so the validation still runs
    # under `python -O`; AssertionError is kept so existing callers that catch
    # it keep working.
    if not header:
        raise AssertionError('no "events: " line found in Cachegrind output')
    if not summary:
        raise AssertionError('no "summary: " line found in Cachegrind output')

    return dict(zip(header.split(), map(int, summary.split())))
def get_counts(cg_results: dict[str, int]) -> dict[str, int]:
    """
    Given the result of run_with_cachegrind(), figure out the parameters we
    will use for the final estimate.

    Returns a dict with the keys ``'l1'``, ``'l3'`` and ``'ram'``, each holding
    the number of accesses attributed to that level. Raises KeyError if
    ``cg_results`` lacks any of the Cachegrind events read below.

    We pretend there's no L2 since Cachegrind doesn't currently support it.

    Caveats: we're not including time to process instructions, only time to
    access instruction cache(s), so we're assuming time to fetch and run an
    instruction is the same as time to retrieve data if they're both to L1
    cache.
    """
    d = cg_results

    # Accesses that missed the last-level cache are counted as RAM hits.
    ram_hits = d['DLmr'] + d['DLmw'] + d['ILmr']
    # L1 misses that were not also last-level misses are counted as L3 hits.
    l3_hits = d['I1mr'] + d['D1mw'] + d['D1mr'] - ram_hits
    # Everything else (instruction reads, data reads, data writes) hit L1.
    total_memory_rw = d['Ir'] + d['Dr'] + d['Dw']
    l1_hits = total_memory_rw - l3_hits - ram_hits

    return {'l1': l1_hits, 'l3': l3_hits, 'ram': ram_hits}
def combined_instruction_estimate(
    counts: dict[str, int],
    *,
    l3_weight: int = 5,
    ram_weight: int = 35,
) -> int:
    """
    Given the result of get_counts(), return an estimate of total time to run.

    ``counts`` must contain the keys ``'l1'``, ``'l3'`` and ``'ram'``. The
    estimate is the L1 count plus the L3 and RAM counts scaled by
    ``l3_weight`` and ``ram_weight`` respectively.

    The default multipliers were determined empirically, but some research
    suggests they're a reasonable approximation for cache time ratios. L3 is
    probably too low, but then we're not simulating L2... Pass different
    weights to experiment with other ratios.
    """
    return counts['l1'] + (l3_weight * counts['l3']) + (ram_weight * counts['ram'])
def main() -> None:
    """
    Command-line entry point.

    Treats everything after the script name as the command to benchmark, runs
    it under Cachegrind and prints the combined instruction estimate as the
    last line of output.
    """
    command = sys.argv[1:]
    estimate = combined_instruction_estimate(get_counts(run_with_cachegrind(command)))
    print(f'{"*" * 80}\nCombined instruction estimate: {estimate:,}')  # noqa: T201
# Only run the benchmark when executed as a script, so the module can also be
# imported and used as a library.
if __name__ == '__main__':
    main()