diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 4dbfa2619be9..0d9ba26aa147 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -99,6 +99,7 @@ namespace {
       (void) llvm::createSCCPPass();
       (void) llvm::createScalarReplAggregatesPass();
       (void) llvm::createSimplifyLibCallsPass();
+      (void) llvm::createSimplifyHalfPowrLibCallsPass();
       (void) llvm::createSingleLoopExtractorPass();
       (void) llvm::createStripSymbolsPass();
       (void) llvm::createStripDeadPrototypesPass();
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index 52f7967af2e1..2c3fdd4a7884 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -317,6 +317,12 @@ Pass *createLoopDeletionPass();
 /// specific well-known (library) functions.
 FunctionPass *createSimplifyLibCallsPass();
 
+//===----------------------------------------------------------------------===//
+//
+/// createSimplifyHalfPowrLibCallsPass - This is an experimental pass that
+/// optimizes specific half_powr functions.
+FunctionPass *createSimplifyHalfPowrLibCallsPass();
+
 //===----------------------------------------------------------------------===//
 //
 // CodeGenPrepare - This pass prepares a function for instruction selection.
diff --git a/llvm/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/llvm/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
new file mode 100644
index 000000000000..530ad038cbe2
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
@@ -0,0 +1,159 @@
+//===- SimplifyHalfPowrLibCalls.cpp - Optimize specific half_powr calls ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple pass that applies an experimental
+// transformation to calls to the __half_powrf4 library function.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplify-libcalls-halfpowr"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Config/config.h"
+using namespace llvm;
+
+namespace {
+  /// SimplifyHalfPowrLibCalls - Function pass that inlines runs of adjacent
+  /// half_powr calls and merges their bodies for subsequent optimization.
+  class VISIBILITY_HIDDEN SimplifyHalfPowrLibCalls : public FunctionPass {
+    const TargetData *TD; // Refreshed at the top of every runOnFunction.
+  public:
+    static char ID; // Pass identification
+    SimplifyHalfPowrLibCalls() : FunctionPass(&ID), TD(0) {}
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>(); // Handed to InlineFunction when inlining.
+    }
+
+    Instruction *
+    InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs,
+                    Instruction *InsertPt);
+  };
+  char SimplifyHalfPowrLibCalls::ID = 0;
+} // end anonymous namespace.
+
+static RegisterPass<SimplifyHalfPowrLibCalls> // Registers the pass by name.
+X("simplify-libcalls-halfpowr", "Simplify half_powr library calls");
+
+// Public interface to the pass: returns a newly allocated pass instance.
+FunctionPass *llvm::createSimplifyHalfPowrLibCallsPass() {
+  return new SimplifyHalfPowrLibCalls(); 
+}
+
+/// InlineHalfPowrs - Inline a sequence of adjacent half_powr calls, rearranging
+/// their control flow to better facilitate subsequent optimization.  Returns
+/// the first instruction of the merged block, or InsertPt if nothing inlined.
+Instruction *
+SimplifyHalfPowrLibCalls::InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs,
+                                        Instruction *InsertPt) {
+  std::vector<BasicBlock *> Bodies;
+  BasicBlock *NewBlock = 0;
+
+  for (unsigned i = 0, e = HalfPowrs.size(); i != e; ++i) {
+    CallInst *Call = cast<CallInst>(HalfPowrs[i]);
+    Function *Callee = Call->getCalledFunction();
+
+    // Minimally sanity-check the CFG of half_powr to ensure that it contains
+    // the kind of code we expect.  A callee with no blocks (a declaration) is
+    // rejected first, before its block list is walked.
+    if (Callee->empty()) break;
+    Function::iterator I = Callee->begin();
+    BasicBlock *Prologue = I++;
+    if (I == Callee->end()) break;
+    BasicBlock *SubnormalHandling = I++;
+    if (I == Callee->end()) break;
+    BasicBlock *Body = I++;
+    if (I != Callee->end()) break;
+    if (SubnormalHandling->getSinglePredecessor() != Prologue) break;
+    BranchInst *PBI = dyn_cast<BranchInst>(Prologue->getTerminator());
+    if (!PBI || !PBI->isConditional()) break;
+    BranchInst *SNBI = dyn_cast<BranchInst>(SubnormalHandling->getTerminator());
+    if (!SNBI || SNBI->isConditional()) break;
+    if (!isa<ReturnInst>(Body->getTerminator())) break;
+
+    // Inline the call, taking care of what code ends up where: first split
+    // the caller's block right after the call, so that everything following
+    // the call lives in NewBlock by the time the call is expanded.
+    Instruction *NextInst = next(BasicBlock::iterator(Call));
+    NewBlock = SplitBlock(NextInst->getParent(), NextInst, this);
+
+    bool B = InlineFunction(Call, 0, TD);
+    assert(B && "half_powr didn't inline?");
+    (void)B; // Only read by the assert above; keeps NDEBUG builds quiet.
+
+    BasicBlock *NewBody = NewBlock->getSinglePredecessor();
+    assert(NewBody);
+    Bodies.push_back(NewBody);
+  }
+
+  if (!NewBlock)
+    return InsertPt;
+
+  // Put the code for all the bodies into one block, to facilitate
+  // subsequent optimization.  Every splice inserts at the front of NewBlock,
+  // so visit Bodies last-to-first to keep the code in original call order.
+  (void)SplitEdge(NewBlock->getSinglePredecessor(), NewBlock, this);
+  for (unsigned i = Bodies.size(); i != 0; --i) {
+    BasicBlock *Body = Bodies[i - 1];
+    Instruction *FNP = Body->getFirstNonPHI();
+    NewBlock->getInstList().splice(NewBlock->begin(), Body->getInstList(),
+                                   FNP, Body->getTerminator());
+  }
+
+  return NewBlock->begin();
+}
+
+/// runOnFunction - Top level algorithm.  Hands each run of up to three adjacent
+/// __half_powrf4 calls to InlineHalfPowrs; returns true if any run was found.
+bool SimplifyHalfPowrLibCalls::runOnFunction(Function &F) {
+  TD = &getAnalysis<TargetData>();
+
+  bool Changed = false;
+  std::vector<Instruction *> HalfPowrs;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) {
+      bool IsHalfPowr = false;
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        // Look for direct calls to externally visible functions whose body
+        // is present in this module; a bare declaration cannot be inlined.
+        Function *Callee = CI->getCalledFunction();
+        if (Callee && Callee->hasExternalLinkage() && !Callee->empty()) {
+          // Look for calls with well-known names.
+          if (strcmp(Callee->getNameStart(), "__half_powrf4") == 0)
+            IsHalfPowr = true;
+        }
+      }
+      if (IsHalfPowr)
+        HalfPowrs.push_back(I);
+      // We're looking for sequences of up to three such calls, which we'll
+      // simplify as a group.
+      if ((!IsHalfPowr && !HalfPowrs.empty()) || HalfPowrs.size() == 3) {
+        // Resume from the returned instruction and re-read its block's end.
+        I = InlineHalfPowrs(HalfPowrs, I);
+        IE = I->getParent()->end();
+        HalfPowrs.clear();
+        Changed = true;
+      }
+    }
+    assert(HalfPowrs.empty() && "Block had no terminator!");
+  }
+
+  return Changed;
+}
diff --git a/llvm/test/Transforms/SimplifyLibCalls/half-powr.ll b/llvm/test/Transforms/SimplifyLibCalls/half-powr.ll
new file mode 100644
index 000000000000..f4e898c0b236
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyLibCalls/half-powr.ll
@@ -0,0 +1,41 @@
+; RUN: llvm-as < %s | opt -simplify-libcalls-halfpowr | llvm-dis | %prcontext {mul float} 1 | grep {mul float} | count 8
+
+define float @__half_powrf4(float %f, float %g) nounwind readnone {
+entry:		; first block: ends in a conditional branch, as the pass expects
+	%0 = fcmp olt float %f, 2.000000e+00		; <i1> [#uses=1]
+	br i1 %0, label %bb, label %bb1
+
+bb:		; preds = %entry (sole predecessor; branches unconditionally)
+	%1 = fdiv float %f, 3.000000e+00		; <float> [#uses=1]
+	br label %bb1
+
+bb1:		; preds = %bb, %entry (third and last block; ends in a return)
+	%f_addr.0 = phi float [ %1, %bb ], [ %f, %entry ]		; <float> [#uses=1]
+	%2 = mul float %f_addr.0, %g		; <float> [#uses=1]
+	ret float %2
+}
+
+define void @foo(float* %p) nounwind {
+entry:		; three back-to-back calls: the longest run the pass groups together
+	%0 = load float* %p, align 4		; <float> [#uses=1]
+	%1 = getelementptr float* %p, i32 1		; <float*> [#uses=1]
+	%2 = load float* %1, align 4		; <float> [#uses=1]
+	%3 = getelementptr float* %p, i32 2		; <float*> [#uses=1]
+	%4 = load float* %3, align 4		; <float> [#uses=1]
+	%5 = getelementptr float* %p, i32 3		; <float*> [#uses=1]
+	%6 = load float* %5, align 4		; <float> [#uses=1]
+	%7 = getelementptr float* %p, i32 4		; <float*> [#uses=1]
+	%8 = load float* %7, align 4		; <float> [#uses=1]
+	%9 = getelementptr float* %p, i32 5		; <float*> [#uses=1]
+	%10 = load float* %9, align 4		; <float> [#uses=1]
+	%11 = tail call float @__half_powrf4(float %0, float %6) nounwind		; <float> [#uses=1]
+	%12 = tail call float @__half_powrf4(float %2, float %8) nounwind		; <float> [#uses=1]
+	%13 = tail call float @__half_powrf4(float %4, float %10) nounwind		; <float> [#uses=1]
+	%14 = getelementptr float* %p, i32 6		; <float*> [#uses=1]
+	store float %11, float* %14, align 4
+	%15 = getelementptr float* %p, i32 7		; <float*> [#uses=1]
+	store float %12, float* %15, align 4
+	%16 = getelementptr float* %p, i32 8		; <float*> [#uses=1]
+	store float %13, float* %16, align 4
+	ret void
+}