brute_force_slp_FAIL.py
"""
Implementation of Larsen and Amarasingh's SLP paper from PLDI '00
I've chosen a bit of a different approach here I'm going to identify longest
runs of code that are isomorphic and independent
From that longest run, I go forward and backward usign use-def and def-use
chains as Larsen and Amarasingh do.
Then I generate vector instructions for these packs. The packs are generated in a
topological order. There is guaranteed to be a topological ordering assuming there
are no phi-nodes.
I am going to try to use a vector pack as many times as possible, to avoid loading
and unloading to and from vector packs. This is the main difference when compared
to naive vectorization, which does not attempt to reuse packs at all.
"""
import click
from itertools import combinations
import json
import sys

from bril_core_constants import *
from bril_core_utilities import *
from cfg import form_cfg_w_blocks, join_cfg
from ssa import bril_to_ssa, ssa_to_bril
from vectorization_utilities import *

UNDEFINED = "undefined"
UNDEFINED_INSTR = {
    ARGS: [UNDEFINED, UNDEFINED],
    DEST: UNDEFINED,
    TYPE: UNDEFINED,
    OP: UNDEFINED,
}

MAX_NUM_RUNS = 5


def filter_undefined_packs(vector_packs):
    """
    Drop pack candidates in which every lane pair is (UNDEFINED, UNDEFINED),
    i.e. packs that consist only of padding.
    """
    new_vector_packs = []
    for vector_perm in vector_packs:
        new_vector_perm = []
        for pack_pair in vector_perm:
            all_undefined = True
            for (fst, snd) in pack_pair:
                if fst != UNDEFINED or snd != UNDEFINED:
                    all_undefined = False
            if not all_undefined:
                new_vector_perm.append(pack_pair)
        new_vector_packs.append(new_vector_perm)
    return new_vector_packs
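
# Illustrative example (operand names hypothetical): a candidate pack made
# entirely of padding, e.g. ((UNDEFINED, UNDEFINED), (UNDEFINED, UNDEFINED)),
# is dropped, while (("a", "b"), (UNDEFINED, UNDEFINED)) survives because at
# least one lane carries real operands.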


def generate_cartesian_product_of_combinations(vector_perms):
    """
    Generate the Cartesian product of combinations, with each element of a
    tuple representing the packing chosen for each run.
    Incredibly expensive when the length of vector_perms is large.
    """
    if len(vector_perms) == 0:
        return []
    elif len(vector_perms) == 1:
        first_vector_of_permutations = vector_perms[0]
        return list(map(lambda e: [e], first_vector_of_permutations))
    final_perms = []
    first_vector_of_permutations = vector_perms[0]
    remainder_permutations = generate_cartesian_product_of_combinations(
        vector_perms[1:])
    for perm in first_vector_of_permutations:
        for remainder_vector_perm in remainder_permutations:
            new_perm = [perm] + remainder_vector_perm
            final_perms.append(new_perm)
    return final_perms
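
# Illustrative example (operand names hypothetical): given per-run options
#     [[(("a", "b"),), (("c", "d"),)], [(("a", "b"),)]]
# the result is
#     [[(("a", "b"),), (("a", "b"),)], [(("c", "d"),), (("a", "b"),)]]
# i.e. every way of choosing one pack candidate per run.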


def brute_force_alignment_vec_size(runs):
    # TODO: handle more than MAX_NUM_RUNS runs
    assert len(runs) <= MAX_NUM_RUNS
    # pad each run with undefined variables to bring it to VECTOR_LANE_WIDTH
    padded_runs = []
    for run in runs:
        padded_run = []
        assert len(run) <= VECTOR_LANE_WIDTH
        for instr in run:
            padded_run.append(instr)
        for _ in range(VECTOR_LANE_WIDTH - len(run)):
            padded_run.append(UNDEFINED_INSTR)
        padded_runs.append(padded_run)
    assert len(padded_runs) == len(runs) <= MAX_NUM_RUNS

    # for each run, pair up the left and right operands of every lane
    packed_pairs = []
    for run in padded_runs:
        left_vec = []
        right_vec = []
        assert len(run) == VECTOR_LANE_WIDTH
        for instr in run:
            assert ARGS in instr
            args = instr[ARGS]
            assert len(args) == 2
            left_vec.append(args[0])
            right_vec.append(args[1])
        packed_pairs.append(list(zip(left_vec, right_vec)))
    assert len(packed_pairs) == len(padded_runs) == len(runs) <= MAX_NUM_RUNS

    # generate all combinations of vector pack members
    packed_pair_combinations = []
    for vector_pair in packed_pairs:
        combination_options = []
        # choose 2, 3, ..., VECTOR_LANE_WIDTH elements as a pack
        for i in range(2, VECTOR_LANE_WIDTH + 1):
            combination_of_packs = combinations(vector_pair, i)
            combination_options += list(combination_of_packs)
        packed_pair_combinations.append(combination_options)
    assert len(packed_pair_combinations) == len(packed_pairs) == len(
        padded_runs) == len(runs) <= MAX_NUM_RUNS

    # filter out all packs that are entirely undefined
    no_undefined_packs = filter_undefined_packs(packed_pair_combinations)

    # generate product pack candidates
    product_pack_candidates = generate_cartesian_product_of_combinations(
        no_undefined_packs)
    if product_pack_candidates == []:
        # no runs (or no viable packs) means there is nothing to align
        return []

    # keep candidates that reuse at least one pack across runs
    pack_reused_candidates = []
    for pack_candidate in product_pack_candidates:
        reused = len(pack_candidate) - len(set(pack_candidate))
        if reused > 0:
            pack_reused_candidates.append(pack_candidate)
    if pack_reused_candidates == []:
        return product_pack_candidates[0]
    return pack_reused_candidates[0]
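
# Illustrative example (VECTOR_LANE_WIDTH assumed to be 4 here): a single run
# of two adds with args ("a", "b") and ("c", "d") is padded with UNDEFINED
# lanes, giving the lane pairs
#     [("a", "b"), ("c", "d"), (UNDEFINED, UNDEFINED), (UNDEFINED, UNDEFINED)]
# candidate packs are every combination of 2..4 of those pairs, and with only
# one run there is nothing to reuse, so the first candidate is returned.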


def slp_basic_block(basic_block_instrs):
    """
    Perform SLP on a basic block
    """
    # build runs of vectorizable instructions
    runs = build_runs(basic_block_instrs)
    # brute force enumerate every possible alignment and vector size, given the runs
    pack_candidate = brute_force_alignment_vec_size(runs)
    print(pack_candidate)
    # vector code generation for the chosen packs is not implemented in this
    # (failed) attempt; leave the block's instructions unchanged
    return basic_block_instrs


def slp_func(func):
    cfg = form_cfg_w_blocks(func)
    for basic_block in cfg:
        new_instrs = slp_basic_block(
            cfg[basic_block][INSTRS])
        cfg[basic_block][INSTRS] = new_instrs
    final_instrs = join_cfg(cfg)
    func[INSTRS] = final_instrs
    return func


def slp_prog(prog):
    ssa_prog = bril_to_ssa(prog)
    for func in ssa_prog[FUNCTIONS]:
        slp_func(func)
    bril_prog = ssa_to_bril(ssa_prog)
    return bril_prog


@click.command()
@click.option('--pretty-print', is_flag=True, default=False,
              help='Pretty print the program before and after SLP vectorization.')
def main(pretty_print):
    prog = json.load(sys.stdin)
    if pretty_print:
        print(json.dumps(prog, indent=4, sort_keys=True))
    final_prog = slp_prog(prog)
    if pretty_print:
        print(json.dumps(final_prog, indent=4, sort_keys=True))
    print(json.dumps(final_prog))


if __name__ == "__main__":
    main()
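
# Example invocation (assumes the Bril `bril2json` tool is installed and the
# helper modules imported above are on PYTHONPATH):
#     bril2json < program.bril | python brute_force_slp_FAIL.py --pretty-print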