llvm-project/polly/test/GPGPU/kernel-params-only-some-arr...

; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=KERNEL %s

; RUN: opt %loadPolly -polly-codegen-ppcg \
; RUN: -S < %s | \
; RUN: FileCheck -check-prefix=IR %s

; REQUIRES: pollyacc
;
;    void kernel_params_only_some_arrays(float A[], float B[]) {
;      for (long i = 0; i < 32; i++)
;        A[i] += 42;
;
;      for (long i = 0; i < 32; i++)
;        B[i] += 42;
;    }

; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0'
; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0"
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"

; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A)
; KERNEL-NEXT:   entry:
; KERNEL-NEXT:     %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
; KERNEL-NEXT:     %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; KERNEL-NEXT:     %t0 = zext i32 %1 to i64

; KERNEL:     ret void
; KERNEL-NEXT: }

; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1'
; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1"
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"

; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B)
; KERNEL-NEXT:   entry:
; KERNEL-NEXT:     %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
; KERNEL-NEXT:     %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; KERNEL-NEXT:     %t0 = zext i32 %1 to i64

; KERNEL:     ret void
; KERNEL-NEXT: }


; IR:       [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
; IR-NEXT:  [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; IR-NEXT:  store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
; IR-NEXT:  [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
; IR-NEXT:  store i8* [[DATA]], i8** [[SLOT]]

; IR:       [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
; IR-NEXT:  [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0
; IR-NEXT:  store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
; IR-NEXT:  [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
; IR-NEXT:  store i8* [[DATA]], i8** [[SLOT]]

; Data layout of the host module handed to opt: little-endian ("e"),
; ELF-style name mangling ("m:e"), i64 aligned to 64 bits, f80 aligned to
; 128 bits, native integer widths of 8/16/32/64 bits, and a 128-bit natural
; stack alignment ("S128").
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Input function for this test. It consists of two consecutive counted loops
; with 32 iterations each: the first one only accesses %A, the second one
; only accesses %B (this matches the C sketch in the comment at the top of
; the file). The check lines above expect two kernels, one taking only
; MemRef_A and one taking only MemRef_B.
define void @kernel_params_only_some_arrays(float* %A, float* %B) {
entry:
  br label %for.cond

; Header of the first loop: %i.0 starts at 0 and the loop is left as soon as
; %i.0 equals 32.
for.cond:                                         ; preds = %for.inc, %entry
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
  %exitcond1 = icmp ne i64 %i.0, 32
  br i1 %exitcond1, label %for.body, label %for.end

; Body of the first loop: A[i] = A[i] + 42.0 (load and store share the same
; address %arrayidx).
for.body:                                         ; preds = %for.cond
  %arrayidx = getelementptr inbounds float, float* %A, i64 %i.0
  %tmp = load float, float* %arrayidx, align 4
  %add = fadd float %tmp, 4.200000e+01
  store float %add, float* %arrayidx, align 4
  br label %for.inc

; Latch of the first loop: advance the induction variable by one.
for.inc:                                          ; preds = %for.body
  %inc = add nuw nsw i64 %i.0, 1
  br label %for.cond

; Exit of the first loop; falls straight through into the second loop.
for.end:                                          ; preds = %for.cond
  br label %for.cond2

; Header of the second loop: %i1.0 starts at 0 and the loop is left as soon
; as %i1.0 equals 32.
for.cond2:                                        ; preds = %for.inc7, %for.end
  %i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ]
  %exitcond = icmp ne i64 %i1.0, 32
  br i1 %exitcond, label %for.body4, label %for.end9

; Body of the second loop: B[i] = B[i] + 42.0; %A is not referenced here.
for.body4:                                        ; preds = %for.cond2
  %arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0
  %tmp2 = load float, float* %arrayidx5, align 4
  %add6 = fadd float %tmp2, 4.200000e+01
  store float %add6, float* %arrayidx5, align 4
  br label %for.inc7

; Latch of the second loop: advance the induction variable by one.
for.inc7:                                         ; preds = %for.body4
  %inc8 = add nuw nsw i64 %i1.0, 1
  br label %for.cond2

; Common function exit after both loops have finished.
for.end9:                                         ; preds = %for.cond2
  ret void
}
GPGPU: create kernel function skeleton Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952 2016-07-19 15:32:38 +08:00			`; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \`
			`; RUN: -disable-output < %s | \`
			`; RUN: FileCheck -check-prefix=KERNEL %s`
GPGPU: use current 'Index' to find slot in parameter array Before this change we used the array index, which would result in us accessing the parameter array out-of-bounds. This bug was visible for test cases where not all arrays in a scop are passed to a given kernel. llvm-svn: 276961 2016-07-28 14:47:53 +08:00
			`; RUN: opt %loadPolly -polly-codegen-ppcg \`
			`; RUN: -S < %s | \`
			`; RUN: FileCheck -check-prefix=IR %s`

test: Add missing 'REQUIRES' line llvm-svn: 275960 2016-07-19 15:39:54 +08:00			`; REQUIRES: pollyacc`
			`;`
GPGPU: create kernel function skeleton Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952 2016-07-19 15:32:38 +08:00			`; void kernel_params_only_some_arrays(float A[], float B[]) {`
			`; for (long i = 0; i < 32; i++)`
			`; A[i] += 42;`
			`;`
			`; for (long i = 0; i < 32; i++)`
			`; B[i] += 42;`
			`; }`

[PPCGCodeGen] Differentiate kernels based on their parent Scop Summary: Add a sequence number that identifies a ptx_kernel's parent Scop within a function to it's name to differentiate it from other kernels produced from the same function, yet different Scops. Kernels produced from different Scops can end up having the same name. Consider a function with 2 Scops and each Scop being able to produce just one kernel. Both of these kernels have the name "kernel_0". This can lead to the wrong kernel being launched when the runtime picks a kernel from its cache based on the name alone. This patch supplements D33985, by differentiating kernels across Scops as well. Previously (even before D33985) while profiling kernels generated through JIT e.g. Julia, [[ https://groups.google.com/d/msg/polly-dev/J1j587H3-Qw/mR-jfL16BgAJ \| kernels associated with different functions, and even different SCoPs within a function, would be grouped together due to the common name ]]. This patch prevents this grouping and the kernels are reported separately. Reviewers: grosser, bollu Reviewed By: grosser Subscribers: mehdi_amini, nemanjai, pollydev, kbarton Tags: #polly Differential Revision: https://reviews.llvm.org/D35176 llvm-svn: 307814 2017-07-13 00:46:19 +08:00			`; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0'`
			`; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0"`
[PPCGCodeGeneration] Update PPCG Code Generation for OpenCL compatibility Added a small change to the way pointer arguments are set in the kernel code generation. The way the pointer is retrieved now, specifically requests global address space to be annotated. This is necessary, if the IR should be run through NVPTX to generate OpenCL compatible PTX. The changes do not affect the PTX Strings generated for the CUDA target (nvptx64-nvidia-cuda), but are necessary for OpenCL (nvptx64-nvidia-nvcl). Additionally, the data layout has been updated to what the NVPTX Backend requests/recommends. Contributed-by: Philipp Schaad Reviewers: Meinersbur, grosser, bollu Reviewed By: grosser, bollu Subscribers: jlebar, pollydev, llvm-commits, nemanjai, yaxunl, Anastasia Tags: #polly Differential Revision: https://reviews.llvm.org/D32215 llvm-svn: 301299 2017-04-25 16:08:29 +08:00			`; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"`
GPGPU: create kernel function skeleton Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952 2016-07-19 15:32:38 +08:00			`; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"`

[PPCGCodeGen] Differentiate kernels based on their parent Scop Summary: Add a sequence number that identifies a ptx_kernel's parent Scop within a function to it's name to differentiate it from other kernels produced from the same function, yet different Scops. Kernels produced from different Scops can end up having the same name. Consider a function with 2 Scops and each Scop being able to produce just one kernel. Both of these kernels have the name "kernel_0". This can lead to the wrong kernel being launched when the runtime picks a kernel from its cache based on the name alone. This patch supplements D33985, by differentiating kernels across Scops as well. Previously (even before D33985) while profiling kernels generated through JIT e.g. Julia, [[ https://groups.google.com/d/msg/polly-dev/J1j587H3-Qw/mR-jfL16BgAJ \| kernels associated with different functions, and even different SCoPs within a function, would be grouped together due to the common name ]]. This patch prevents this grouping and the kernels are reported separately. Reviewers: grosser, bollu Reviewed By: grosser Subscribers: mehdi_amini, nemanjai, pollydev, kbarton Tags: #polly Differential Revision: https://reviews.llvm.org/D35176 llvm-svn: 307814 2017-07-13 00:46:19 +08:00			`; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A)`
GPGPU: create kernel function skeleton Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952 2016-07-19 15:32:38 +08:00			`; KERNEL-NEXT: entry:`
GPGPU: add intrinsic functions to obtain a kernels thread and block ids llvm-svn: 275953 2016-07-19 15:32:44 +08:00			`; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()`
			`; KERNEL-NEXT: %b0 = zext i32 %0 to i64`
			`; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()`
			`; KERNEL-NEXT: %t0 = zext i32 %1 to i64`
GPGPU: generate code for ScopStatements This change introduces the actual compute code in the GPU kernels. To ensure all values referenced from the statements in the GPU kernel are indeed available we scan all ScopStmts in the GPU kernel for references to llvm::Values that are not yet covered by already modeled outer loop iterators, parameters, or array base pointers and also pass these additional llvm::Values to the GPU kernel. For arrays used in the GPU kernel we introduce a new ScopArrayInfo object, which is referenced by the newly generated access functions within the GPU kernel and which is used to help with code generation. llvm-svn: 276270 2016-07-21 21:15:59 +08:00
			`; KERNEL: ret void`
GPGPU: create kernel function skeleton Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952 2016-07-19 15:32:38 +08:00			`; KERNEL-NEXT: }`

[PPCGCodeGen] Differentiate kernels based on their parent Scop Summary: Add a sequence number that identifies a ptx_kernel's parent Scop within a function to it's name to differentiate it from other kernels produced from the same function, yet different Scops. Kernels produced from different Scops can end up having the same name. Consider a function with 2 Scops and each Scop being able to produce just one kernel. Both of these kernels have the name "kernel_0". This can lead to the wrong kernel being launched when the runtime picks a kernel from its cache based on the name alone. This patch supplements D33985, by differentiating kernels across Scops as well. Previously (even before D33985) while profiling kernels generated through JIT e.g. Julia, [[ https://groups.google.com/d/msg/polly-dev/J1j587H3-Qw/mR-jfL16BgAJ \| kernels associated with different functions, and even different SCoPs within a function, would be grouped together due to the common name ]]. This patch prevents this grouping and the kernels are reported separately. Reviewers: grosser, bollu Reviewed By: grosser Subscribers: mehdi_amini, nemanjai, pollydev, kbarton Tags: #polly Differential Revision: https://reviews.llvm.org/D35176 llvm-svn: 307814 2017-07-13 00:46:19 +08:00			`; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1'`
			`; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1"`
[PPCGCodeGeneration] Update PPCG Code Generation for OpenCL compatibility Added a small change to the way pointer arguments are set in the kernel code generation. The way the pointer is retrieved now, specifically requests global address space to be annotated. This is necessary, if the IR should be run through NVPTX to generate OpenCL compatible PTX. The changes do not affect the PTX Strings generated for the CUDA target (nvptx64-nvidia-cuda), but are necessary for OpenCL (nvptx64-nvidia-nvcl). Additionally, the data layout has been updated to what the NVPTX Backend requests/recommends. Contributed-by: Philipp Schaad Reviewers: Meinersbur, grosser, bollu Reviewed By: grosser, bollu Subscribers: jlebar, pollydev, llvm-commits, nemanjai, yaxunl, Anastasia Tags: #polly Differential Revision: https://reviews.llvm.org/D32215 llvm-svn: 301299 2017-04-25 16:08:29 +08:00			`; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"`
GPGPU: create kernel function skeleton Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952 2016-07-19 15:32:38 +08:00			`; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"`

[PPCGCodeGen] Differentiate kernels based on their parent Scop Summary: Add a sequence number that identifies a ptx_kernel's parent Scop within a function to it's name to differentiate it from other kernels produced from the same function, yet different Scops. Kernels produced from different Scops can end up having the same name. Consider a function with 2 Scops and each Scop being able to produce just one kernel. Both of these kernels have the name "kernel_0". This can lead to the wrong kernel being launched when the runtime picks a kernel from its cache based on the name alone. This patch supplements D33985, by differentiating kernels across Scops as well. Previously (even before D33985) while profiling kernels generated through JIT e.g. Julia, [[ https://groups.google.com/d/msg/polly-dev/J1j587H3-Qw/mR-jfL16BgAJ \| kernels associated with different functions, and even different SCoPs within a function, would be grouped together due to the common name ]]. This patch prevents this grouping and the kernels are reported separately. Reviewers: grosser, bollu Reviewed By: grosser Subscribers: mehdi_amini, nemanjai, pollydev, kbarton Tags: #polly Differential Revision: https://reviews.llvm.org/D35176 llvm-svn: 307814 2017-07-13 00:46:19 +08:00			`; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B)`
GPGPU: create kernel function skeleton Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952 2016-07-19 15:32:38 +08:00			`; KERNEL-NEXT: entry:`
GPGPU: add intrinsic functions to obtain a kernels thread and block ids llvm-svn: 275953 2016-07-19 15:32:44 +08:00			`; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()`
			`; KERNEL-NEXT: %b0 = zext i32 %0 to i64`
			`; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()`
			`; KERNEL-NEXT: %t0 = zext i32 %1 to i64`
GPGPU: generate code for ScopStatements This change introduces the actual compute code in the GPU kernels. To ensure all values referenced from the statements in the GPU kernel are indeed available we scan all ScopStmts in the GPU kernel for references to llvm::Values that are not yet covered by already modeled outer loop iterators, parameters, or array base pointers and also pass these additional llvm::Values to the GPU kernel. For arrays used in the GPU kernel we introduce a new ScopArrayInfo object, which is referenced by the newly generated access functions within the GPU kernel and which is used to help with code generation. llvm-svn: 276270 2016-07-21 21:15:59 +08:00
			`; KERNEL: ret void`
GPGPU: create kernel function skeleton Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952 2016-07-19 15:32:38 +08:00			`; KERNEL-NEXT: }`

GPGPU: use current 'Index' to find slot in parameter array Before this change we used the array index, which would result in us accessing the parameter array out-of-bounds. This bug was visible for test cases where not all arrays in a scop are passed to a given kernel. llvm-svn: 276961 2016-07-28 14:47:53 +08:00
GPGPU: Detect read-only scalar arrays ... and pass these by value rather than by reference. llvm-svn: 281837 2016-09-18 03:22:18 +08:00			`; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)`
[Polly][PPCGCodeGen] OpenCL now gets kernel argument size from PPCG CodeGen Summary: PPCGCodeGeneration now attaches the size of the kernel launch parameters at the end of the parameter list. For the existing CUDA Runtime, this gets ignored, but the OpenCL Runtime knows to check for kernel-argument size at the end of the parameter list. (The resulting parameters list is twice as long. This has been accounted for in the corresponding test cases). Reviewers: grosser, Meinersbur, bollu Reviewed By: bollu Subscribers: nemanjai, yaxunl, Anastasia, pollydev, llvm-commits Tags: #polly Differential Revision: https://reviews.llvm.org/D32961 llvm-svn: 302515 2017-05-09 18:45:52 +08:00			`; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0`
GPGPU: Detect read-only scalar arrays ... and pass these by value rather than by reference. llvm-svn: 281837 2016-09-18 03:22:18 +08:00			`; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0`
GPGPU: use current 'Index' to find slot in parameter array Before this change we used the array index, which would result in us accessing the parameter array out-of-bounds. This bug was visible for test cases where not all arrays in a scop are passed to a given kernel. llvm-svn: 276961 2016-07-28 14:47:53 +08:00			`; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*`
			`; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]`

GPGPU: Detect read-only scalar arrays ... and pass these by value rather than by reference. llvm-svn: 281837 2016-09-18 03:22:18 +08:00			`; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)`
[Polly][PPCGCodeGen] OpenCL now gets kernel argument size from PPCG CodeGen Summary: PPCGCodeGeneration now attaches the size of the kernel launch parameters at the end of the parameter list. For the existing CUDA Runtime, this gets ignored, but the OpenCL Runtime knows to check for kernel-argument size at the end of the parameter list. (The resulting parameters list is twice as long. This has been accounted for in the corresponding test cases). Reviewers: grosser, Meinersbur, bollu Reviewed By: bollu Subscribers: nemanjai, yaxunl, Anastasia, pollydev, llvm-commits Tags: #polly Differential Revision: https://reviews.llvm.org/D32961 llvm-svn: 302515 2017-05-09 18:45:52 +08:00			`; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0`
GPGPU: Detect read-only scalar arrays ... and pass these by value rather than by reference. llvm-svn: 281837 2016-09-18 03:22:18 +08:00			`; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0`
GPGPU: use current 'Index' to find slot in parameter array Before this change we used the array index, which would result in us accessing the parameter array out-of-bounds. This bug was visible for test cases where not all arrays in a scop are passed to a given kernel. llvm-svn: 276961 2016-07-28 14:47:53 +08:00			`; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*`
			`; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]`

GPGPU: create kernel function skeleton Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952 2016-07-19 15:32:38 +08:00			`target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"`

			`define void @kernel_params_only_some_arrays(float* %A, float* %B) {`
			`entry:`
			`br label %for.cond`

			`for.cond: ; preds = %for.inc, %entry`
			`%i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]`
			`%exitcond1 = icmp ne i64 %i.0, 32`
			`br i1 %exitcond1, label %for.body, label %for.end`

			`for.body: ; preds = %for.cond`
			`%arrayidx = getelementptr inbounds float, float* %A, i64 %i.0`
			`%tmp = load float, float* %arrayidx, align 4`
			`%add = fadd float %tmp, 4.200000e+01`
			`store float %add, float* %arrayidx, align 4`
			`br label %for.inc`

			`for.inc: ; preds = %for.body`
			`%inc = add nuw nsw i64 %i.0, 1`
			`br label %for.cond`

			`for.end: ; preds = %for.cond`
			`br label %for.cond2`

			`for.cond2: ; preds = %for.inc7, %for.end`
			`%i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ]`
			`%exitcond = icmp ne i64 %i1.0, 32`
			`br i1 %exitcond, label %for.body4, label %for.end9`

			`for.body4: ; preds = %for.cond2`
			`%arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0`
			`%tmp2 = load float, float* %arrayidx5, align 4`
			`%add6 = fadd float %tmp2, 4.200000e+01`
			`store float %add6, float* %arrayidx5, align 4`
			`br label %for.inc7`

			`for.inc7: ; preds = %for.body4`
			`%inc8 = add nuw nsw i64 %i1.0, 1`
			`br label %for.cond2`

			`for.end9: ; preds = %for.cond2`
			`ret void`
			`}`