Halide 16.0.0
Halide compiler and libraries
LoopNest.h
/** This file defines the LoopNest, which is our
 * representation of a Halide schedule, and contains methods to
 * generate candidates for scheduling as well as extract a
 * featurization that can be used to cost each candidate. */

#ifndef LOOP_NEST_H
#define LOOP_NEST_H

#include "ASLog.h"
#include "CostModel.h"
#include "FunctionDAG.h"
#include "GPULoopInfo.h"
#include "GPUMemInfo.h"
#include "PerfectHashMap.h"
#include "SearchSpaceOptions.h"
#include "Statistics.h"
#include "ThreadInfo.h"
#include "Tiling.h"
#include <set>
#include <vector>

namespace Halide {
namespace Internal {
namespace Autoscheduler {

template<typename T>
using NodeMap = PerfectHashMap<FunctionDAG::Node, T>;

template<typename T>
using StageMap = PerfectHashMap<FunctionDAG::Node::Stage, T>;

enum class GPU_parallelism { Block,
                             Thread,
                             Serial,
                             Simd,
                             Parallelized,
                             None };

std::string stringify(GPU_parallelism label);

// inlined => func is inlined so has no memory store location
enum class GPUMemoryType { Global,
                           Shared,
                           Local,
                           Registers,
                           Inlined };

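As a rough, editor-added illustration of what these storage classes correspond to in a hand-written Halide schedule (the pipeline and the mapping comments are assumptions for illustration, not output of the autoscheduler):

    #include "Halide.h"
    using namespace Halide;

    void gpu_memory_type_example() {
        Func producer("producer"), consumer("consumer");
        Var x("x"), y("y"), xo("xo"), yo("yo"), xi("xi"), yi("yi");

        producer(x, y) = x + y;
        consumer(x, y) = producer(x, y) + producer(x + 1, y);

        // Give the consumer a GPU block/thread structure.
        consumer.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);

        // Global:  producer.compute_root() would store it in global GPU memory.
        // Shared:  compute the producer once per block and keep it in shared memory.
        producer.compute_at(consumer, xo)
            .store_in(MemoryType::GPUShared)
            .gpu_threads(x, y);
        // Local/Registers: compute_at(consumer, xi) keeps the allocation private to a
        //                  thread; small constant-size allocations can live in registers.
        // Inlined: producer.compute_inline() removes the allocation entirely.
    }
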
bool may_subtile(const Anderson2021Params &params);

int64_t get_shared_memory_limit(const Anderson2021Params &params);

int64_t get_active_block_hardware_limit(const Anderson2021Params &params);

int64_t get_active_warp_hardware_limit(const Anderson2021Params &params);

constexpr int64_t get_register_mem_alloc_limit() {
    return 128;
}

int get_unroll_limit(const Target &target);

bool in_range_zero_one(double x);

bool are_valid_thread_extents(const vector<int64_t> &counts);

double get_idle_lane_wastage_limit_env_var();
double get_idle_lane_wastage_limit();

bool all(const vector<int> &v);
bool accessed_at_constant_indices(const std::vector<int> &unrolled, const FunctionDAG::Edge *e);

// We're going to do a tree search over possible schedules to find an
// optimal one. A tree search requires a state, and a function that
// gives you children of the state (with costs). The following struct
// represents the state, which is a partial schedule.
//
// A partial schedule is a tree. Each node is some portion of the for
// loop nest of some Func. If there are no children, it's the
// innermost set of loops. If there are children, it's a loop over
// tiles of that Func.
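Purely as an editor-added illustration of the "state plus children with costs" idea described above (the real driver lives in the search code; every name here is hypothetical), one step of a beam search over such states might look like:

    #include <algorithm>
    #include <functional>
    #include <vector>

    // Keep the beam_size cheapest states among all children of the current beam.
    template<typename State>
    std::vector<State> beam_search_step(const std::vector<State> &beam,
                                        const std::function<std::vector<State>(const State &)> &children,
                                        const std::function<double(const State &)> &cost,
                                        size_t beam_size) {
        std::vector<State> candidates;
        for (const State &s : beam) {
            for (State &c : children(s)) {
                candidates.push_back(std::move(c));
            }
        }
        std::sort(candidates.begin(), candidates.end(),
                  [&](const State &a, const State &b) { return cost(a) < cost(b); });
        if (candidates.size() > beam_size) {
            candidates.resize(beam_size);
        }
        return candidates;
    }
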
struct LoopNest {
    mutable RefCount ref_count;

    // The extents of this loop. Put another way, the number of tiles,
    // not the size of each tile.
    vector<int64_t> size;

    // The nodes inside the loop body
    vector<IntrusivePtr<const LoopNest>> children;

    // Funcs inlined into this inner loop, and the number of times
    // each is called. Only valid if children is empty.
    NodeMap<int64_t> inlined;

    // Funcs stored inside this loop
    std::set<const FunctionDAG::Node *> store_at;

    // The total bounds required of any given Func over all iterations
    // of this loop. In the paper, this is represented using the
    // little boxes to the left of the loop nest tree figures.
    mutable NodeMap<Bound> bounds;

    // The Func this loop nest belongs to
    const FunctionDAG::Node *node = nullptr;

    // The stage of the Func
    const FunctionDAG::Node::Stage *stage = nullptr;

    // Is this the innermost loop of this func (the SIMD loop)?
    bool innermost = false;

    // Are we permitted to tile this loop?
    bool tileable = false;

    // Is this the parallel outer loop?
    bool parallel = false;

    // What dimension is this Func vectorized over, in terms of the pure args of the Func?
    int vector_dim = -1;

    // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.
    int vectorized_loop_index = -1;

    // Apply gpu threads to this loop nest
    GPU_parallelism gpu_label = GPU_parallelism::None;

    struct FeatureIntermediates {
        double inlined_calls;
        double num_vectors;
        double num_scalars;
        double vector_size;
        double innermost_pure_loop_extent;
        double outer_parallelism;
        double num_warps_per_block;
        double num_threads_per_block;
        double points_computed_per_thread;
    };

    mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates;
    mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features;

    bool is_gpu_serial(const Target &target) const {
        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Serial;
    }

    bool is_gpu_thread(const Target &target) const {
        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Thread;
    }

    bool is_gpu_block(const Target &target) const {
        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Block;
    }

    bool is_scalar() const {
        return size.empty();
    }

    // Given a node f newly inserted into this LoopNest, get the union of thread
    // counts in each dimension across all siblings of f.
    vector<int64_t> get_union_thread_counts(const FunctionDAG::Node *f) const;

    // Given a node f newly inserted into this LoopNest, get the sizes of all of
    // f's stages and their pure_dim indices.
    void get_stage_sizes(const FunctionDAG::Node *f,
                         vector<vector<int64_t>> &stage_sizes,
                         vector<vector<int>> &pure_dims,
                         vector<int> &vectorized_indices) const;

    // Given the loop nest of a stage to parallelize at root, figure out if using odd
    // tile sizes for the vectorized dimension will allow the resulting thread tiles to
    // be multiples of 32. If so, include those sizes in the serial loop sizes.
    void generate_vec_dim_serial_tilings(vector<int> &serial_sizes) const;

    // Get the loop nests of a newly inserted node, f, that is marked GPU threads. Tiles
    // the newly inserted loop nests of f into a threads loop outside a serial loop.
    // v is the vectorized dimension of f. Adds the loop nests created from each tiling
    // option to result.
    bool add_gpu_thread_tilings(const FunctionDAG::Node *f,
                                const Anderson2021Params &params,
                                const Target &target,
                                int v,
                                vector<IntrusivePtr<const LoopNest>> &result,
                                const vector<int64_t> &max_size);

    void copy_from(const LoopNest &n);
    void copy_from_including_features(const LoopNest &n);

    static void hash_combine(uint64_t &h, uint64_t next) {
        // From boost
        h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
    }

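For reference, the mixing step above is Boost's hash_combine formula and can be exercised on its own. The values below are made-up stand-ins for the kinds of quantities structural_hash folds together (child counts, loop sizes, and so on):

    #include <cstdint>

    // Same formula as LoopNest::hash_combine above.
    inline void combine(uint64_t &h, uint64_t next) {
        h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
    }

    inline uint64_t example_structural_hash() {
        uint64_t h = 0;
        combine(h, 2);   // e.g. number of children at this level
        combine(h, 64);  // e.g. extent of one loop
        combine(h, 8);   // e.g. extent of another loop
        return h;        // note: the result depends on the order of combination
    }
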
    // Hash the loop structure and sizes up to a fixed depth. This is
    // used as the hash function for the coarse-to-fine beam search in
    // the paper.
    void structural_hash(uint64_t &h, int depth) const;

    // How many funcs are scheduled inside this loop level. Used in
    // the structural hash.
    size_t funcs_realized_or_inlined() const {
        size_t count = inlined.size() + store_at.size();
        for (const auto &c : children) {
            count += c->funcs_realized_or_inlined();
        }
        return count;
    }

    // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.
    struct Sites {
        const LoopNest *compute = nullptr;    // Its containing compute_at site
        const LoopNest *store = nullptr;      // Its containing store_at site
        const LoopNest *produce = nullptr;    // Its own outermost node
        const LoopNest *innermost = nullptr;  // Its innermost node - usually a SIMD loop
        const LoopNest *task = nullptr;       // The parallel for loop it belongs to
        const LoopNest *thread = nullptr;     // Its containing gpu_thread loop
        GPUMemoryType gpu_store_memory_type;  // global, local, shared?
        int64_t allocation_size = 0;          // Allocation size in bytes
        bool is_constant_allocation = false;  // Does the allocation have constant size?
        int64_t num_realizations = 0;         // Number of times this stage is realized. Only valid for unscheduled producers
        bool inlined = false;                 // Is the Func inlined?
        std::vector<const LoopNest *> inlined_innermosts;  // The innermost loop nests into which this Func is inlined
        uint64_t hash_of_producers_stored_at_root = 0;

        bool is_stored_in_global_mem() const {
            return gpu_store_memory_type == GPUMemoryType::Global;
        }
        bool is_stored_in_shared_mem() const {
            return gpu_store_memory_type == GPUMemoryType::Shared;
        }
        bool is_stored_in_local_mem() const {
            return gpu_store_memory_type == GPUMemoryType::Local;
        }
        bool is_stored_in_registers() const {
            return gpu_store_memory_type == GPUMemoryType::Registers;
        }
    };

    GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined = false) const;

    std::vector<int> unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const;

    void get_allocs_that_can_be_promoted_to_registers(const Target &target,
                                                      StageMap<Sites> &sites,
                                                      NodeMap<bool> &can_be_promoted_to_registers,
                                                      const LoopNest *grandparent,
                                                      const LoopNest *parent) const;

    bool promote_allocs_to_registers(const Target &target, StageMap<Sites> &sites) const;

    // Compute all the sites of interest for each pipeline stage
    void get_sites(const Target &target,
                   StageMap<Sites> &sites,
                   StageMap<int64_t> &shared_mem_alloc_sizes,
                   const LoopNest *task = nullptr,
                   const LoopNest *parent = nullptr,
                   const LoopNest *current_thread_loop = nullptr) const;

    // A helper for the working_set_at_task feature. Most features are
    // computed in the recursive pass 'compute_features' below, but
    // this one must be done in a second separate recursive pass.
    void set_working_set_at_task_feature(int64_t working_set,
                                         StageMap<ScheduleFeatures> *features) const {
        for (const auto &c : children) {
            c->set_working_set_at_task_feature(working_set, features);
            features->get(c->stage).working_set_at_task = working_set;
        }
    }

    bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const;

    bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const;

    bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const;

    const LoopNest *find_pure_stage_loop_nest(const FunctionDAG::Node *node) const;

    int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const;

    int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const;

    // Get the stride over "node's" storage for a unit increment in the vectorized loop's
    // index
    double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const;

    Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose = false) const;

    bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const;

    int get_actual_vector_dim(const Bound &store_bounds) const;

    void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector<int64_t> &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose = false) const;

    bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const;

    bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const;

    int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const;

    int vectorized_access_size(size_t loop_index, bool verbose = false) const;

    template<typename T>
    void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose = false) const;

    std::pair<double, double> compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const;

    template<typename T>
    MemInfoType<T> compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;

    template<typename T>
    void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType<T> &mem_info, double serial_loop_extents, bool verbose = false) const;

    double compute_local_mem_stride(double stride, double bytes) const;

    // Assumes block, serial, thread or block, thread nesting
    const LoopNest *get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const;

    std::pair<int64_t, int64_t> get_block_and_serial_extents(const LoopNest *block) const;

    bool all_paths_to_leaves_have_thread_loop() const;

    bool has_thread_loop_descendant() const;

    void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const;

    // Assume that when a block is active, all its warps are active
    void compute_warp_and_block_occupancy(const Anderson2021Params &params, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const;

    void compute_shared_mem_occupancy(const Anderson2021Params &params, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const;

    std::pair<const LoopNest *, const LoopNest *> find_innermost_and_parent() const;

    int64_t points_accessed_per_thread(const Anderson2021Params &params, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector<const FunctionDAG::Edge *> &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const;

    int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const;

    void memoize_points_computed_minimum(StageMap<ScheduleFeatures> &memoized_features, const StageMap<ScheduleFeatures> *features) const;

    vector<pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;

    uint64_t compute_hash_of_producers_stored_at_root(const StageMap<Sites> &sites) const;

    void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;

    void memoize_features(StageMap<ScheduleFeatures> &memoized_features, const StageMap<ScheduleFeatures> *features) const;

    void compute_working_set_from_features(int64_t *working_set,
                                           const StageMap<ScheduleFeatures> *features) const;

    void recompute_inlined_features(const StageMap<Sites> &sites, StageMap<ScheduleFeatures> *features) const;

    std::pair<int64_t, bool> compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const;

    // Do a recursive walk over the loop nest computing features to feed the cost model.
    void compute_features(const FunctionDAG &dag,
                          const Anderson2021Params &params,
                          const Target &target,
                          const StageMap<Sites> &sites,
                          int64_t instances,
                          int64_t parallelism,
                          const LoopNest *parent,
                          const LoopNest *grandparent,
                          const LoopNest &root,
                          GPULoopInfo gpu_loop_info,
                          bool use_memoized_features,
                          const StageMap<int64_t> &total_shared_mem_alloc_sizes,
                          int64_t *working_set,
                          int64_t *working_set_local_constant,
                          int64_t *working_set_local_dynamic,
                          StageMap<ScheduleFeatures> *features,
                          Statistics &stats,
                          bool verbose = false) const;

    bool is_root() const {
        // The root is the sole node without a Func associated with
        // it.
        return node == nullptr;
    }

    // Set the region required of a Func at this site.
    const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {
        return bounds.emplace(f, b);
    }

    // Get the region required of a Func at this site, from which we
    // know what region would be computed if it were scheduled here,
    // and what its loop nest would be.
    const Bound &get_bounds(const FunctionDAG::Node *f) const;

    // Get the region required of a Func at this site (but only to satisfy the
    // consumers along the given edge chain), from which we know what region
    // would be computed if it were scheduled here and what its loop nest
    // would be.
    Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector<const FunctionDAG::Edge *> &edge_chain) const;

    void dump() const;

    std::string to_string() const;

    // Recursively print a loop nest representation to stderr
    template<typename T>
    void dump(T &stream, string prefix, const LoopNest *parent) const;

    // Does this loop nest access the given Func
    bool calls(const FunctionDAG::Node *f) const;

    // What is the maximum number of inlined calls to a Func that
    // occur within this loop. Used to prune states that would
    // generate too much code.
    int64_t max_inlined_calls() const;

    // Does this loop nest access an input buffer? Used to select
    // tail strategies when splitting loops. We don't want to read
    // out of bounds on inputs, even if we don't intend to use the
    // values read. It could create annoying assertion failures for
    // the user. It's OK to read out of range of the values computed
    // on internal Funcs though. Allocation bounds inference just pads
    // out the bounds so that it won't fault.
    bool accesses_input_buffer() const;

    // Does this loop nest contain a computation of the given Func.
    bool computes(const FunctionDAG::Node *f) const;

    // Above here most methods query the loop nest. Below we have
    // methods that mutate the loop nest.

    // Inline a Func into all consumers within this loop.
    void inline_func(const FunctionDAG::Node *f);

    // Compute a Func at this site.
    bool compute_here(const FunctionDAG::Node *f,
                      bool tileable,
                      int v,
                      bool in_threads_loop,
                      const Anderson2021Params &params,
                      const Target &target);

    // Parallelize this loop according to the given tiling.
    IntrusivePtr<const LoopNest> parallelize_in_tiles(const vector<int64_t> &tiling,
                                                      const LoopNest *parent,
                                                      const Anderson2021Params &params,
                                                      const Target &target,
                                                      bool inner_tiling,
                                                      bool adjust_tiling,
                                                      bool move_all_rvars_inward = true,
                                                      const vector<int> &rvars_to_move_inward = {}) const;

    int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false, bool in_threads_loop = false) const;
    int64_t get_total_constant_local_mem_alloc_size() const;

    // All store_ats further in than the block level must be fixed-sized
    // allocations. This method checks whether f will require a dynamic
    // allocation.
    bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const;

    // Return all possible ways to compute f in tiles somewhere within
    // this loop nest.
    // in_threads_loop tracks whether or not the function is going to be placed inside a
    // loop marked gpu_threads, in which case f's loops cannot be gpu_threads.
    vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,
                                                          const LoopNest *parent,
                                                          const Anderson2021Params &params,
                                                          const Target &target,
                                                          const SearchSpaceOptions &search_space_options,
                                                          int v,
                                                          bool in_realization,
                                                          bool in_threads_loop,
                                                          bool is_pre_pass,
                                                          vector<int64_t> union_counts = vector<int64_t>()) const;

    // Below here we have methods that apply a schedule to a Halide pipeline.

    // A model of the state of the loop nest of a Func while applying
    // Halide's scheduling directives.

    // Note that StageScheduleState is movable-but-not-copyable thanks to its ostringstream member.
    struct StageScheduleState {
        // How much parallelism do we need to exploit with this Func?
        double num_cores = 0;

        // Which storage dimension is vectorized? We need to reorder it innermost
        int vector_dim = -1;
        int vectorized_loop_index = -1;

        // The various Vars and RVars used for scheduling a Func.
        struct FuncVar {
            // The top-level var or rvar this was split off from
            VarOrRVar orig;

            // This var.
            VarOrRVar var;

            // Source code to access this Var/RVar. Used for printing
            // valid Halide source for this schedule.
            string accessor;

            // Our estimate of the extent of this var. This is exact
            // when constant_extent flag is true.
            int64_t extent = 0;

            // Which index in the symbolic loop nest does this var
            // belong to.
            size_t index = 0;

            // Some flags.
            bool innermost_pure_dim = false,
                 outermost = false,
                 parallel = false,
                 exists = false,
                 pure = false,
                 constant_extent = false;

            bool vectorized = false;
            bool gpu_threads = false;

            FuncVar()
                : orig(Var()), var(Var()) {
            }
        };

        bool parallel = false;
        bool vectorized = false;

        // In order from innermost to outermost. Each group of d is one tiling level.
        vector<FuncVar> vars;

        // In order from innermost to outermost. Each group of d is one tiling level.
        vector<FuncVar> ordered_vars;
        vector<int64_t> gpu_thread_extents;

        NodeMap<std::vector<std::pair<const LoopNest *, std::vector<const FunctionDAG::Edge *>>>> producers_to_be_staged;

        // From outermost in
        vector<StageScheduleState *> ancestors;

        std::ostringstream schedule_source;
    };

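To make the bookkeeping above concrete, this is the kind of hand-written Halide schedule a StageScheduleState describes: each Var produced by the tile below would be recorded as one FuncVar (with its extent estimate and parallel/vectorized flags), and the text of these scheduling calls is what schedule_source accumulates. Editor-added illustration; the names are made up:

    #include "Halide.h"
    using namespace Halide;

    void stage_schedule_example() {
        Func f("f");
        Var x("x"), y("y"), xo("xo"), xi("xi"), yo("yo"), yi("yi");
        f(x, y) = x + y;

        // One tiling level: {xo, yo} outer, {xi, yi} inner.
        f.compute_root()
            .tile(x, y, xo, yo, xi, yi, 64, 64)
            .vectorize(xi, 8)
            .parallel(yo);
    }
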
    bool has_constant_region_computed(const FunctionDAG::Node *node) const;
    bool has_constant_region_required(const FunctionDAG::Node *node) const;
    bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const;
    int num_serial_loops(const FunctionDAG::Node::Stage *stage) const;
    int num_serial_loops() const;
    bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const;

    void update_producers_to_be_staged(StageScheduleState &state, const NodeMap<bool> &all_inlined) const;
    bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const;

    // Apply the schedule represented by this loop nest to a Halide pipeline.
    void apply(LoopLevel here,
               StageMap<std::unique_ptr<StageScheduleState>> &state_map,
               double num_cores,
               int depth,
               const LoopNest *parent,
               const LoopNest *compute_site,
               const Target &target,
               std::vector<StageScheduleState *> &ancestors,
               const NodeMap<bool> &all_inlined) const;

    double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const;

    void collect_nodes_that_should_be_inlined(const NodeMap<bool> &nodes_to_freeze, NodeMap<bool> &inlined_nodes) const;

    void collect_all_inlined(NodeMap<bool> &all_inlined) const;

    int64_t product_of_self_and_descendants(int loop_index) const;
    int64_t product_of_descendants(int loop_index) const;

    void get_stages_computed_in_each_compute_root_loop(StageMap<StageMap<bool>> &descendants, const LoopNest *compute_root_loop_nest = nullptr) const;
};

struct Filter {
    const LoopNest *loop_nest;
    bool logging = false;

    Filter(const LoopNest *loop_nest)
        : loop_nest{loop_nest}, logging{enable_filter_printing()} {
        if (logging) {
            std::cerr << "\nState filtered: \n";
            loop_nest->dump();
            std::cerr << "Reason: ";
        }
    }

    template<typename T>
    Filter &operator<<(T &&x) {
        if (logging) {
            std::cerr << std::forward<T>(x);
        }
        return *this;
    }

    static bool enable_filter_printing();
};
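Filter is a small logging helper for pruning decisions: construct it with the loop nest being rejected, then stream the reason. A hypothetical call site (editor-added sketch, not code from the search itself):

    // Somewhere inside a pruning check:
    bool validate_extents(const LoopNest *loop_nest, bool extents_ok) {
        if (!extents_ok) {
            Filter(loop_nest) << "Invalid thread extents\n";
            return false;
        }
        return true;
    }
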

}  // namespace Autoscheduler
}  // namespace Internal
}  // namespace Halide

#endif  // LOOP_NEST_H