// Origin: forked from OSchip/llvm-project (126 lines, 3.4 KiB; listed as "ArmAsm").
//===----------------------Hexagon builtin routine ------------------------===//
|
|
//
|
|
// The LLVM Compiler Infrastructure
|
|
//
|
|
// This file is dual licensed under the MIT and the University of Illinois Open
|
|
// Source Licenses. See LICENSE.TXT for details.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
|
|
//
|
|
// The corresponding C prototype for this function would be
//   void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                        const unsigned *src,
//                                        unsigned num_words);
|
|
//
|
|
// *** Both dest and src must be aligned to 32-bit boundaries. ***
|
|
// The code does not perform any runtime checks for this, and will fail
|
|
// in bad ways if this requirement is not met.
|
|
//
|
|
// The "forward" in the name refers to the fact that the function copies
|
|
// the words going forward in memory. It is incorrect to use this function
|
|
// for cases where the original code copied words in any other order.
|
|
//
|
|
// *** This function is only for use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function, when a mem-copy loop, like the one above, is detected.
|
|
|
|
// hexagon_memcpy_forward_vp4cp4n2 -- copy num_words 32-bit words, ascending.
//
// Registers on entry:
//   r0 = dest       (advanced by 4 or 8 bytes as words are stored)
//   r1 = src        (advanced by 4 or 8 bytes as words are loaded)
//   r2 = num_words  (words still to be copied)
// Registers written as scratch: r3, r4, r5, p0; loop0 and loop1 are used.
// Control returns through r31.
//
// The copy is done in three phases, each started with an l2fetch on src:
//   head : single words until src reaches the end of its 4096-byte page,
//          or until num_words runs out (skipped when that count is 0);
//   pages: whole 4096-byte pages, 512 double-word loads per page;
//   tail : the words that remain (fewer than 1024), one at a time.
//
// l2fetch descriptors are built from 0x202000 with a block count in the low
// bits.  NOTE(review): this matches the l2fetch operand layout of
// stride[31:16] = 32, width[15:8] = 32, height[7:0] = number of 32-byte
// blocks -- confirm against the Hexagon programmer's reference manual.

    .text

    .globl  hexagon_memcpy_forward_vp4cp4n2
    .balign 32
    .type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

    // r5 = num_words / 8: 32-byte blocks in the whole request.
    // r3 = 4096 - src: the low 12 bits are the byte distance from src to
    // the end of its page (all zero when src is already page-aligned).
    {
      r5 = lsr(r2, #3)
      r3 = sub(##4096, r1)
    }

    // Both extracts operate on the r3 computed by the packet above:
    //   bits [11:2] -> words left in the current page        (new r3)
    //   bits [11:5] -> 32-byte blocks left in the current page (r4)
    {
      r3 = extractu(r3, #10, #2)
      r4 = extractu(r3, #7, #5)
    }

    // Never copy or prefetch more than the caller requested.
    {
      r3 = minu(r2, r3)
      r4 = minu(r5, r4)
    }

    // Finish the head-phase prefetch descriptor in r4.  If there are no
    // head words (src page-aligned, or num_words == 0) go straight on.
    {
      r4 = or(r4, ##0x202000)           // 0x202000 = 2105344
      p0 = cmp.eq(r3, #0)
      if (p0.new) jump:nt .Lhead_done
    }

    l2fetch(r1, r4)
    {
      loop0(.Lhead, r3)
      r2 = sub(r2, r3)                  // words still owed once the head is done
    }

    .falign
.Lhead:
    // One word per iteration: load from src, store the fresh value to dest.
    {
      r4 = memw(r1++#4)
      memw(r0++#4) = r4.new
    } :endloop0

.Lhead_done:
    // r3 = number of complete pages left (1024 words per page).
    // With none left, only the tail phase remains.
    {
      r3 = lsr(r2, #10)
      if (cmp.eq(r3.new, #0)) jump:nt .Lbody_done
    }

    // loop1 takes the page count from r3 in the same packet that replaces
    // r3 with the per-page prefetch descriptor.
    {
      loop1(.Lpage_outer, r3)
      r2 = extractu(r2, #10, #0)        // r2 &= 1023: words kept for the tail
      r3 = ##0x202080                   // 0x202080 = 2105472: 128 blocks = 1 page
    }

    // Outer loop: one iteration per page.
    .falign
.Lpage_outer:
    l2fetch(r1, r3)                     // prefetch the page about to be copied
    loop0(.Lpage_inner, #512)           // 512 double-words = 4096 bytes

    // Inner loop: one 64-bit load from src, two 32-bit stores to dest.
    // src sits on a page boundary whenever this phase is entered.
    .falign
.Lpage_inner:
    r5:4 = memd(r1++#8)
    {
      memw(r0++#8) = r4
      memw(r0+#4) = r5
    } :endloop0:endloop1

.Lbody_done:
    // Tail phase.  Return immediately when nothing is left; otherwise
    // r4 = 32-byte blocks remaining, used as the prefetch block count.
    {
      r3 = ##0x202000                   // 0x202000 = 2105344 (prefetch info)
      r4 = lsr(r2, #3)
      p0 = cmp.eq(r2, #0)
      if (p0.new) jumpr:nt r31
    }
    {
      r3 = or(r3, r4)
      loop0(.Ltail, r2)
    }

    l2fetch(r1, r3)

    .falign
.Ltail:
    // Same single-word copy as the head phase, for the last r2 words.
    {
      r4 = memw(r1++#4)
      memw(r0++#4) = r4.new
    } :endloop0

    jumpr r31

    .size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2
|