#include "flang/ISO_Fortran_binding_wrapper.h"
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/Inquiry.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "flang/Optimizer/Dialect/Support/KindMapping.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include "flang/Optimizer/Transforms/Passes.h.inc"

Namespaces
namespace	fir

Macros
#define	GEN_PASS_DEF_LOOPVERSIONING

#define	DEBUG_TYPE "flang-loop-versioning"

Detailed Description

This pass looks for loops iterating over assumed-shape arrays that can be optimized by "guessing" that the stride is element-sized.

This is done by creating two versions of the same loop: one which assumes that the elements are contiguous (stride == size of element), and one that is the original generic loop.

As a side-effect of the assumed element size stride, the array is also flattened to make it a 1D array - this is because the internal array structure must be either 1D or have known sizes in all dimensions - and at least one of the dimensions here is already unknown.

There are two distinct benefits here:

1. The loop that iterates over the elements is somewhat simplified by the constant stride calculation.
2. Since the compiler knows the size of the stride, it can use vector instructions, whereas a stride that is unknown at compile time often prevents vector operations from being used.

A known drawback is that the code size is increased, and in some cases the increase can be quite substantial - 3-4x is quite plausible. (This includes the effect of the loop being vectorized, which in itself often more than doubles the size of the code, because unless the loop size is known, there will be a modulo vector-size remainder to deal with.)

TODO: Do we need some size limit where loops no longer get duplicated?

Namespaces

Macros

Detailed Description