; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
; RUN: opt -S -passes=openmpopt < %s | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
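; This test models hiding the latency of host-device memory transfers in the
; OpenMPOpt pass: the runtime call __tgt_target_data_begin should be split
; into an "issue" part, hoisted as early as its operands allow, and a "wait"
; part, sunk to just before the first use of the mapped data, so that
; independent host computation overlaps with the transfer. The FIXME comments
; below sketch the expected result; the CHECK lines currently match the
; untransformed input.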
; FIXME: This struct should be generated after splitting at least one of the runtime calls.
; %struct.__tgt_async_info = type { i8* }
%struct.ident_t = type { i32, i32, i32, i32, i8* }
%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35]
@.__omp_offloading_heavyComputation1.region_id = weak constant i8 0
@.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 8]
@.offload_maptypes.2 = private unnamed_addr constant [1 x i64] [i64 800]
@.__omp_offloading_heavyComputation2.region_id = weak constant i8 0
@.offload_maptypes.3 = private unnamed_addr constant [2 x i64] [i64 35, i64 35]
@.__omp_offloading_heavyComputation3.region_id = weak constant i8 0
@.offload_sizes.2 = private unnamed_addr constant [2 x i64] [i64 4, i64 0]
@.offload_maptypes.4 = private unnamed_addr constant [2 x i64] [i64 800, i64 544]
@.offload_maptypes.5 = private unnamed_addr constant [1 x i64] [i64 33]
;double heavyComputation1() {
;  double a = rand() % 777;
;  double random = rand();
;
;  //#pragma omp target data map(a)
;  void* args[1];
;  args[0] = &a;
;  __tgt_target_data_begin(..., args, ...)
;
;  #pragma omp target teams
;  for (int i = 0; i < 1000; ++i) {
;    a *= i*i / 2;
;  }
;
;  return random + a;
;}
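; The intended result, sketched at the source level (hypothetical; the
; issue/wait entry points follow the declarations commented out at the end
; of this file): the "issue" is hoisted above the independent rand() call
; and the "wait" is sunk to just before the target region, so the host
; computation overlaps with the data transfer.
;
;  double a = rand() % 777;
;  handle = __tgt_target_data_begin_issue(..., args, ...); // issue, hoisted
;  double random = rand();          // independent work, overlaps transfer
;  __tgt_target_data_begin_wait(device_id, handle);        // wait, sunk
;  #pragma omp target teams
;  ...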
define dso_local double @heavyComputation1() {
; CHECK-LABEL: define {{[^@]+}}@heavyComputation1()
; CHECK-NEXT: entry:
; CHECK-NEXT: %a = alloca double, align 8
; CHECK-NEXT: %.offload_baseptrs = alloca [1 x i8*], align 8
; CHECK-NEXT: %.offload_ptrs = alloca [1 x i8*], align 8
; CHECK-NEXT: %.offload_baseptrs4 = alloca [1 x i8*], align 8
; CHECK-NEXT: %.offload_ptrs5 = alloca [1 x i8*], align 8
; CHECK-NEXT: %0 = bitcast double* %a to i8*
; CHECK-NEXT: %call = call i32 @rand()
; CHECK-NEXT: %rem = srem i32 %call, 777
; CHECK-NEXT: %conv = sitofp i32 %rem to double
; CHECK-NEXT: store double %conv, double* %a, align 8
; CHECK-NEXT: %call1 = call i32 @rand()
; CHECK-NEXT: %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
; CHECK-NEXT: %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
; CHECK-NEXT: store double* %a, double** %2, align 8
; CHECK-NEXT: %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
; CHECK-NEXT: %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
; CHECK-NEXT: store double* %a, double** %4, align 8
; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
; CHECK-NEXT: %5 = bitcast double* %a to i64*
; CHECK-NEXT: %6 = load i64, i64* %5, align 8
; CHECK-NEXT: %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
; CHECK-NEXT: %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64*
; CHECK-NEXT: store i64 %6, i64* %8, align 8
; CHECK-NEXT: %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0
; CHECK-NEXT: %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
; CHECK-NEXT: store i64 %6, i64* %10, align 8
; CHECK-NEXT: %11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
; CHECK-NEXT: %12 = icmp eq i32 %11, 0
; CHECK-NEXT: br i1 %12, label %omp_offload.cont, label %omp_offload.failed
; CHECK: omp_offload.failed:
; CHECK-NEXT: call void @heavyComputation1FallBack(i64 %6)
; CHECK-NEXT: br label %omp_offload.cont
; CHECK: omp_offload.cont:
; CHECK-NEXT: %conv2 = sitofp i32 %call1 to double
; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
; CHECK-NEXT: %13 = load double, double* %a, align 8
; CHECK-NEXT: %add = fadd double %13, %conv2
; CHECK-NEXT: ret double %add
;
entry:
%a = alloca double, align 8
%.offload_baseptrs = alloca [1 x i8*], align 8
%.offload_ptrs = alloca [1 x i8*], align 8
%.offload_baseptrs4 = alloca [1 x i8*], align 8
%.offload_ptrs5 = alloca [1 x i8*], align 8
; FIXME: These allocas should be present after splitting the runtime call __tgt_target_data_begin:
; %device_id1 = alloca i64, align 8
; %async_info1 = alloca %struct.__tgt_async_info, align 8
%0 = bitcast double* %a to i8*
%call = call i32 @rand()
%rem = srem i32 %call, 777
%conv = sitofp i32 %rem to double
store double %conv, double* %a, align 8
; FIXME: The "isue" should be moved here.
%call1 = call i32 @rand()
; FIXME: This setup for the runtime call __tgt_target_data_begin should be
; split into its "issue" and "wait" counterparts and moved upwards
; and downwards, respectively. The call should be replaced with something
; like ...
; Issue - this is moved upwards.
; ... setup code ...
; store i64 -1, i64* %device_id1, align 8
; %handle1 = call i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
; Wait - this is moved downwards.
; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
; store i8* %handle1, i8** %queue1, align 8
; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
%1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
%2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
store double* %a, double** %2, align 8
%3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
%4 = bitcast [1 x i8*]* %.offload_ptrs to double**
store double* %a, double** %4, align 8
call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
%5 = bitcast double* %a to i64*
%6 = load i64, i64* %5, align 8
%7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
%8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64*
store i64 %6, i64* %8, align 8
%9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0
%10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64*
store i64 %6, i64* %10, align 8
; FIXME: The "wait" should be moved here.
%11 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i32 0, i32 0)
%12 = icmp eq i32 %11, 0
br i1 %12, label %omp_offload.cont, label %omp_offload.failed
omp_offload.failed: ; preds = %entry
call void @heavyComputation1FallBack(i64 %6)
br label %omp_offload.cont
omp_offload.cont: ; preds = %entry, %omp_offload.failed
%conv2 = sitofp i32 %call1 to double
call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0))
%13 = load double, double* %a, align 8
%add = fadd double %13, %conv2
ret double %add
}
define internal void @heavyComputation1FallBack(i64 %a) {
entry:
; Fallback for offloading function heavyComputation1.
ret void
}
;int heavyComputation2(double* a, unsigned size) {
;  int random = rand() % 7;
;
;  //#pragma omp target data map(a[0:size], size)
;  void* args[2];
;  args[0] = &a;
;  args[1] = &size;
;  __tgt_target_data_begin(..., args, ...)
;
;  #pragma omp target teams
;  for (int i = 0; i < size; ++i) {
;    a[i] = ++a[i] * 3.141624;
;  }
;
;  return random;
;}
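; Here a is not restrict-qualified, so it is not guaranteed that rand() does
; not write *a through an alias: the "issue" cannot be hoisted above the
; rand() call, and only the "wait" can be sunk (hypothetical sketch):
;
;  int random = rand() % 7;        // may modify *a, pins the issue here
;  handle = __tgt_target_data_begin_issue(..., args, ...);
;  /* target-region argument setup, overlaps with the transfer */
;  __tgt_target_data_begin_wait(device_id, handle); // just before the kernel
;  #pragma omp target teams
;  ...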
define dso_local i32 @heavyComputation2(double* %a, i32 %size) {
; CHECK-LABEL: define {{[^@]+}}@heavyComputation2(double* %a, i32 %size)
; CHECK-NEXT: entry:
; CHECK-NEXT: %size.addr = alloca i32, align 4
; CHECK-NEXT: %.offload_baseptrs = alloca [2 x i8*], align 8
; CHECK-NEXT: %.offload_ptrs = alloca [2 x i8*], align 8
; CHECK-NEXT: %.offload_sizes = alloca [2 x i64], align 8
; CHECK-NEXT: %.offload_baseptrs2 = alloca [2 x i8*], align 8
; CHECK-NEXT: %.offload_ptrs3 = alloca [2 x i8*], align 8
; CHECK-NEXT: store i32 %size, i32* %size.addr, align 4
; CHECK-NEXT: %call = call i32 @rand()
; CHECK-NEXT: %conv = zext i32 %size to i64
; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
; CHECK-NEXT: %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
; CHECK-NEXT: store double* %a, double** %2, align 8
; CHECK-NEXT: %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
; CHECK-NEXT: %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
; CHECK-NEXT: store double* %a, double** %4, align 8
; CHECK-NEXT: %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
; CHECK-NEXT: store i64 %0, i64* %5, align 8
; CHECK-NEXT: %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
; CHECK-NEXT: %7 = bitcast i8** %6 to i32**
; CHECK-NEXT: store i32* %size.addr, i32** %7, align 8
; CHECK-NEXT: %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
; CHECK-NEXT: %9 = bitcast i8** %8 to i32**
; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8
; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
; CHECK-NEXT: store i64 4, i64* %10, align 8
; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
; CHECK-NEXT: %size.casted = zext i32 %11 to i64
; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
; CHECK-NEXT: %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64*
; CHECK-NEXT: store i64 %size.casted, i64* %13, align 8
; CHECK-NEXT: %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0
; CHECK-NEXT: %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64*
; CHECK-NEXT: store i64 %size.casted, i64* %15, align 8
; CHECK-NEXT: %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1
; CHECK-NEXT: %17 = bitcast i8** %16 to double**
; CHECK-NEXT: store double* %a, double** %17, align 8
; CHECK-NEXT: %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
; CHECK-NEXT: %19 = bitcast i8** %18 to double**
; CHECK-NEXT: store double* %a, double** %19, align 8
; CHECK-NEXT: %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
; CHECK-NEXT: %21 = icmp eq i32 %20, 0
; CHECK-NEXT: br i1 %21, label %omp_offload.cont, label %omp_offload.failed
; CHECK: omp_offload.failed:
; CHECK-NEXT: call void @heavyComputation2FallBack(i64 %size.casted, double* %a)
; CHECK-NEXT: br label %omp_offload.cont
; CHECK: omp_offload.cont:
; CHECK-NEXT: %rem = srem i32 %call, 7
; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
; CHECK-NEXT: ret i32 %rem
;
entry:
%size.addr = alloca i32, align 4
%.offload_baseptrs = alloca [2 x i8*], align 8
%.offload_ptrs = alloca [2 x i8*], align 8
%.offload_sizes = alloca [2 x i64], align 8
%.offload_baseptrs2 = alloca [2 x i8*], align 8
%.offload_ptrs3 = alloca [2 x i8*], align 8
; FIXME: These allocas should be present after splitting the runtime call __tgt_target_data_begin:
; %device_id1 = alloca i64, align 8
; %async_info1 = alloca %struct.__tgt_async_info, align 8
store i32 %size, i32* %size.addr, align 4
%call = call i32 @rand()
; FIXME: This setup for the runtime call __tgt_target_data_begin should be
; split into its "issue" and "wait" counterparts. Here though, the "issue"
; cannot be moved upwards because it's not guaranteed that rand()
; won't modify *a. Nevertheless, the "wait" can be moved downwards.
; The call should be replaced with something like ...
; Issue - this can't be moved upwards; *a might have aliases.
; ... setup code ...
; store i64 -1, i64* %device_id1, align 8
; %handle1 = call i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
; Wait - this is moved downwards.
; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
; store i8* %handle1, i8** %queue1, align 8
; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
%conv = zext i32 %size to i64
%0 = shl nuw nsw i64 %conv, 3
%1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
%2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
store double* %a, double** %2, align 8
%3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
%4 = bitcast [2 x i8*]* %.offload_ptrs to double**
store double* %a, double** %4, align 8
%5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
store i64 %0, i64* %5, align 8
%6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
%7 = bitcast i8** %6 to i32**
store i32* %size.addr, i32** %7, align 8
%8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
%9 = bitcast i8** %8 to i32**
store i32* %size.addr, i32** %9, align 8
%10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
store i64 4, i64* %10, align 8
call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
%11 = load i32, i32* %size.addr, align 4
%size.casted = zext i32 %11 to i64
%12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
%13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64*
store i64 %size.casted, i64* %13, align 8
%14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0
%15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64*
store i64 %size.casted, i64* %15, align 8
%16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1
%17 = bitcast i8** %16 to double**
store double* %a, double** %17, align 8
%18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
%19 = bitcast i8** %18 to double**
store double* %a, double** %19, align 8
; FIXME: The "wait" should be moved here.
%20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
%21 = icmp eq i32 %20, 0
br i1 %21, label %omp_offload.cont, label %omp_offload.failed
omp_offload.failed: ; preds = %entry
call void @heavyComputation2FallBack(i64 %size.casted, double* %a)
br label %omp_offload.cont
omp_offload.cont: ; preds = %entry, %omp_offload.failed
%rem = srem i32 %call, 7
call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
ret i32 %rem
}
define internal void @heavyComputation2FallBack(i64 %size, double* %a) {
entry:
; Fallback for offloading function heavyComputation2.
ret void
}
;int heavyComputation3(double* restrict a, unsigned size) {
;  int random = rand() % 7;
;
;  //#pragma omp target data map(a[0:size], size)
;  void* args[2];
;  args[0] = &a;
;  args[1] = &size;
;  __tgt_target_data_begin(..., args, ...)
;
;  #pragma omp target teams
;  for (int i = 0; i < size; ++i) {
;    a[i] = ++a[i] * 3.141624;
;  }
;
;  return random;
;}
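; In contrast to heavyComputation2, a is restrict-qualified here, so rand()
; cannot modify *a and the "issue" can be hoisted above it as well
; (hypothetical sketch):
;
;  handle = __tgt_target_data_begin_issue(..., args, ...); // issue, hoisted
;  int random = rand() % 7;         // overlaps with the data transfer
;  __tgt_target_data_begin_wait(device_id, handle);        // wait, sunk
;  #pragma omp target teams
;  ...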
define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) {
; CHECK-LABEL: define {{[^@]+}}@heavyComputation3(double* noalias %a, i32 %size)
; CHECK-NEXT: entry:
; CHECK-NEXT: %size.addr = alloca i32, align 4
; CHECK-NEXT: %.offload_baseptrs = alloca [2 x i8*], align 8
; CHECK-NEXT: %.offload_ptrs = alloca [2 x i8*], align 8
; CHECK-NEXT: %.offload_sizes = alloca [2 x i64], align 8
; CHECK-NEXT: %.offload_baseptrs2 = alloca [2 x i8*], align 8
; CHECK-NEXT: %.offload_ptrs3 = alloca [2 x i8*], align 8
; CHECK-NEXT: store i32 %size, i32* %size.addr, align 4
; CHECK-NEXT: %call = call i32 @rand()
; CHECK-NEXT: %conv = zext i32 %size to i64
; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
; CHECK-NEXT: %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
; CHECK-NEXT: %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
; CHECK-NEXT: store double* %a, double** %2, align 8
; CHECK-NEXT: %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
; CHECK-NEXT: %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
; CHECK-NEXT: store double* %a, double** %4, align 8
; CHECK-NEXT: %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
; CHECK-NEXT: store i64 %0, i64* %5, align 8
; CHECK-NEXT: %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
; CHECK-NEXT: %7 = bitcast i8** %6 to i32**
; CHECK-NEXT: store i32* %size.addr, i32** %7, align 8
; CHECK-NEXT: %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
; CHECK-NEXT: %9 = bitcast i8** %8 to i32**
; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8
; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
; CHECK-NEXT: store i64 4, i64* %10, align 8
; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
; CHECK-NEXT: %size.casted = zext i32 %11 to i64
; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
; CHECK-NEXT: %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64*
; CHECK-NEXT: store i64 %size.casted, i64* %13, align 8
; CHECK-NEXT: %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0
; CHECK-NEXT: %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64*
; CHECK-NEXT: store i64 %size.casted, i64* %15, align 8
; CHECK-NEXT: %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1
; CHECK-NEXT: %17 = bitcast i8** %16 to double**
; CHECK-NEXT: store double* %a, double** %17, align 8
; CHECK-NEXT: %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
; CHECK-NEXT: %19 = bitcast i8** %18 to double**
; CHECK-NEXT: store double* %a, double** %19, align 8
; CHECK-NEXT: %20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
; CHECK-NEXT: %21 = icmp eq i32 %20, 0
; CHECK-NEXT: br i1 %21, label %omp_offload.cont, label %omp_offload.failed
; CHECK: omp_offload.failed:
; CHECK-NEXT: call void @heavyComputation3FallBack(i64 %size.casted, double* %a)
; CHECK-NEXT: br label %omp_offload.cont
; CHECK: omp_offload.cont:
; CHECK-NEXT: %rem = srem i32 %call, 7
; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
; CHECK-NEXT: ret i32 %rem
;
entry:
%size.addr = alloca i32, align 4
%.offload_baseptrs = alloca [2 x i8*], align 8
%.offload_ptrs = alloca [2 x i8*], align 8
%.offload_sizes = alloca [2 x i64], align 8
%.offload_baseptrs2 = alloca [2 x i8*], align 8
%.offload_ptrs3 = alloca [2 x i8*], align 8
; FIXME: These allocas should be present after splitting the runtime call __tgt_target_data_begin:
; %device_id1 = alloca i64, align 8
; %async_info1 = alloca %struct.__tgt_async_info, align 8
store i32 %size, i32* %size.addr, align 4
; FIXME: The "issue" should be moved here.
%call = call i32 @rand()
; FIXME: This setup for the runtime call __tgt_target_data_begin should be
; split into its "issue" and "wait" counterparts and moved upwards
; and downwards, respectively. The call should be replaced with something
; like ...
; Issue - this is moved upwards.
; ... setup code ...
; store i64 -1, i64* %device_id1, align 8
; %handle1 = call i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8) %device_id1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
; Wait - this is moved downwards.
; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
; store i8* %handle1, i8** %queue1, align 8
; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
%conv = zext i32 %size to i64
%0 = shl nuw nsw i64 %conv, 3
%1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
%2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
store double* %a, double** %2, align 8
%3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
%4 = bitcast [2 x i8*]* %.offload_ptrs to double**
store double* %a, double** %4, align 8
%5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
store i64 %0, i64* %5, align 8
%6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
%7 = bitcast i8** %6 to i32**
store i32* %size.addr, i32** %7, align 8
%8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
%9 = bitcast i8** %8 to i32**
store i32* %size.addr, i32** %9, align 8
%10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
store i64 4, i64* %10, align 8
call void @__tgt_target_data_begin(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
%11 = load i32, i32* %size.addr, align 4
%size.casted = zext i32 %11 to i64
%12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
%13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64*
store i64 %size.casted, i64* %13, align 8
%14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0
%15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64*
store i64 %size.casted, i64* %15, align 8
%16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1
%17 = bitcast i8** %16 to double**
store double* %a, double** %17, align 8
%18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1
%19 = bitcast i8** %18 to double**
store double* %a, double** %19, align 8
; FIXME: The "wait" should be moved here.
%20 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i32 0, i32 0)
%21 = icmp eq i32 %20, 0
br i1 %21, label %omp_offload.cont, label %omp_offload.failed
omp_offload.failed: ; preds = %entry
call void @heavyComputation3FallBack(i64 %size.casted, double* %a)
br label %omp_offload.cont
omp_offload.cont: ; preds = %entry, %omp_offload.failed
%rem = srem i32 %call, 7
call void @__tgt_target_data_end(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0))
ret i32 %rem
}
define internal void @heavyComputation3FallBack(i64 %size, double* %a) {
entry:
; Fallback for offloading function heavyComputation3.
ret void
}
;int dataTransferOnly1(double* restrict a, unsigned size) {
;  // Random computation.
;  int random = rand();
;
;  //#pragma omp target data map(to:a[0:size])
;  void* args[1];
;  args[0] = &a;
;  __tgt_target_data_begin(..., args, ...)
;
;  // Random computation.
;  random %= size;
;  return random;
;}
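; There is no target region here at all, so the begin/end pair only
; transfers data: the "issue" can be hoisted above rand() and the "wait"
; sunk all the way to just before __tgt_target_data_end (hypothetical
; sketch):
;
;  handle = __tgt_target_data_begin_issue(..., args, ...); // issue, hoisted
;  int random = rand();             // overlaps with the data transfer
;  random %= size;
;  __tgt_target_data_begin_wait(device_id, handle);        // wait, sunk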
define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) {
; CHECK-LABEL: define {{[^@]+}}@dataTransferOnly1(double* noalias %a, i32 %size)
; CHECK-NEXT: entry:
; CHECK-NEXT: %.offload_baseptrs = alloca [1 x i8*], align 8
; CHECK-NEXT: %.offload_ptrs = alloca [1 x i8*], align 8
; CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
; CHECK-NEXT: %call = call i32 @rand()
; CHECK-NEXT: %conv = zext i32 %size to i64
; CHECK-NEXT: %0 = shl nuw nsw i64 %conv, 3
; CHECK-NEXT: %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
; CHECK-NEXT: %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
; CHECK-NEXT: store double* %a, double** %2, align 8
; CHECK-NEXT: %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
; CHECK-NEXT: %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
; CHECK-NEXT: store double* %a, double** %4, align 8
; CHECK-NEXT: %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
; CHECK-NEXT: store i64 %0, i64* %5, align 8
; CHECK-NEXT: call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
; CHECK-NEXT: %rem = urem i32 %call, %size
; CHECK-NEXT: call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
; CHECK-NEXT: ret i32 %rem
;
entry:
%.offload_baseptrs = alloca [1 x i8*], align 8
%.offload_ptrs = alloca [1 x i8*], align 8
%.offload_sizes = alloca [1 x i64], align 8
; FIXME: These allocas should be present after splitting the runtime call __tgt_target_data_begin:
; %device_id1 = alloca i64, align 8
; %async_info1 = alloca %struct.__tgt_async_info, align 8
; FIXME: The "issue" should be moved here.
%call = call i32 @rand()
; FIXME: This setup for the runtime call __tgt_target_data_begin should be
; split into its "issue" and "wait" counterparts and moved upwards
; and downwards, respectively. The call should be replaced with something
; like ...
; Issue - this is moved upwards.
; ... setup code ...
; store i64 -1, i64* %device_id1, align 8
; %handle1 = call i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8) %device_id1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
; Wait - this is moved downwards.
; %device_id1_copy = load i64, i64* %device_id1, align 8 ; device_id
; %queue1 = getelementptr inbounds %struct.__tgt_async_info, %struct.__tgt_async_info* %async_info1, i32 0, i32 0
; store i8* %handle1, i8** %queue1, align 8
; call void @__tgt_target_data_begin_wait(i64 %device_id1_copy, %struct.__tgt_async_info* dereferenceable(8) %async_info1)
%conv = zext i32 %size to i64
%0 = shl nuw nsw i64 %conv, 3
%1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
%2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
store double* %a, double** %2, align 8
%3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
%4 = bitcast [1 x i8*]* %.offload_ptrs to double**
store double* %a, double** %4, align 8
%5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
store i64 %0, i64* %5, align 8
call void @__tgt_target_data_begin(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
%rem = urem i32 %call, %size
; FIXME: The "wait" should be moved here.
call void @__tgt_target_data_end(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0))
ret i32 %rem
}
declare dso_local void @__tgt_target_data_begin(i64, i32, i8**, i8**, i64*, i64*)
declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32)
declare dso_local void @__tgt_target_data_end(i64, i32, i8**, i8**, i64*, i64*)
declare dso_local i32 @rand()
; FIXME: These two function declarations must be generated after splitting the runtime function
; __tgt_target_data_begin.
; declare dso_local i8* @__tgt_target_data_begin_issue(i64* dereferenceable(8), i32, i8**, i8**, i64*, i64*)
; declare dso_local void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info* dereferenceable(8))