From 621ad468d99d4013a4298465f02707a5e9e89cae Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 24 Dec 2020 05:08:09 -0800
Subject: [PATCH] [mlir] Async: lowering async.value to LLVM

1. Add new methods to Async runtime API to support yielding async values
2. Add lowering from `async.yield` with value payload to the new runtime API calls

`async.value` lowering requires that payload type is convertible to LLVM and supported by `llvm.mlir.cast` (DialectCast) operation.

Reviewed By: csigg

Differential Revision: https://reviews.llvm.org/D93592
---
 .../mlir/ExecutionEngine/AsyncRuntime.h       |  30 +
 .../Conversion/AsyncToLLVM/AsyncToLLVM.cpp    | 565 +++++++++++++-----
 .../lib/Conversion/AsyncToLLVM/CMakeLists.txt |   2 +
 mlir/lib/ExecutionEngine/AsyncRuntime.cpp     |  69 ++-
 .../AsyncToLLVM/convert-to-llvm.mlir          |  41 ++
 mlir/test/mlir-cpu-runner/async-value.mlir    |  64 ++
 6 files changed, 620 insertions(+), 151 deletions(-)
 create mode 100644 mlir/test/mlir-cpu-runner/async-value.mlir

diff --git a/mlir/include/mlir/ExecutionEngine/AsyncRuntime.h b/mlir/include/mlir/ExecutionEngine/AsyncRuntime.h
index e3d90198f36c..0fe44cd1c127 100644
--- a/mlir/include/mlir/ExecutionEngine/AsyncRuntime.h
+++ b/mlir/include/mlir/ExecutionEngine/AsyncRuntime.h
@@ -45,6 +45,12 @@ typedef struct AsyncToken AsyncToken;
 // Runtime implementation of `async.group` data type.
 typedef struct AsyncGroup AsyncGroup;
 
+// Runtime implementation of `async.value` data type.
+typedef struct AsyncValue AsyncValue;
+
+// Async value payload stored in a memory owned by the async.value.
+using ValueStorage = void *;
+
 // Async runtime uses LLVM coroutines to represent asynchronous tasks. Task
 // function is a coroutine handle and a resume function that continue coroutine
 // execution from a suspension point.
@@ -66,6 +72,13 @@ extern "C" MLIR_ASYNCRUNTIME_EXPORT void
 // Create a new `async.token` in not-ready state.
 extern "C" MLIR_ASYNCRUNTIME_EXPORT AsyncToken *mlirAsyncRuntimeCreateToken();
 
+// Create a new `async.value` in not-ready state. Size parameter specifies the
+// number of bytes that will be allocated for the async value storage. Storage
+// is owned by the `async.value` and deallocated when the async value is
+// destructed (reference count drops to zero).
+extern "C" MLIR_ASYNCRUNTIME_EXPORT AsyncValue *
+    mlirAsyncRuntimeCreateValue(int32_t);
+
 // Create a new `async.group` in empty state.
 extern "C" MLIR_ASYNCRUNTIME_EXPORT AsyncGroup *mlirAsyncRuntimeCreateGroup();
 
@@ -76,14 +89,26 @@ mlirAsyncRuntimeAddTokenToGroup(AsyncToken *, AsyncGroup *);
 extern "C" MLIR_ASYNCRUNTIME_EXPORT void
 mlirAsyncRuntimeEmplaceToken(AsyncToken *);
 
+// Switches `async.value` to ready state and runs all awaiters.
+extern "C" MLIR_ASYNCRUNTIME_EXPORT void
+mlirAsyncRuntimeEmplaceValue(AsyncValue *);
+
 // Blocks the caller thread until the token becomes ready.
 extern "C" MLIR_ASYNCRUNTIME_EXPORT void
 mlirAsyncRuntimeAwaitToken(AsyncToken *);
 
+// Blocks the caller thread until the value becomes ready.
+extern "C" MLIR_ASYNCRUNTIME_EXPORT void
+mlirAsyncRuntimeAwaitValue(AsyncValue *);
+
 // Blocks the caller thread until the elements in the group become ready.
 extern "C" MLIR_ASYNCRUNTIME_EXPORT void
 mlirAsyncRuntimeAwaitAllInGroup(AsyncGroup *);
 
+// Returns a pointer to the storage owned by the async value.
+extern "C" MLIR_ASYNCRUNTIME_EXPORT ValueStorage
+mlirAsyncRuntimeGetValueStorage(AsyncValue *);
+
 // Executes the task (coro handle + resume function) in one of the threads
 // managed by the runtime.
 extern "C" MLIR_ASYNCRUNTIME_EXPORT void mlirAsyncRuntimeExecute(CoroHandle,
@@ -94,6 +119,11 @@ extern "C" MLIR_ASYNCRUNTIME_EXPORT void mlirAsyncRuntimeExecute(CoroHandle,
 extern "C" MLIR_ASYNCRUNTIME_EXPORT void
 mlirAsyncRuntimeAwaitTokenAndExecute(AsyncToken *, CoroHandle, CoroResume);
 
+// Executes the task (coro handle + resume function) in one of the threads
+// managed by the runtime after the value becomes ready.
+extern "C" MLIR_ASYNCRUNTIME_EXPORT void
+mlirAsyncRuntimeAwaitValueAndExecute(AsyncValue *, CoroHandle, CoroResume);
+
 // Executes the task (coro handle + resume function) in one of the threads
 // managed by the runtime after the all members of the group become ready.
 extern "C" MLIR_ASYNCRUNTIME_EXPORT void
diff --git a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
index 3daa70b0a952..f1d6264606be 100644
--- a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
+++ b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
@@ -9,9 +9,11 @@
 #include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
 
 #include "../PassDetail.h"
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Dialect/Async/IR/Async.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Dialect/StandardOps/Transforms/FuncConversions.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/TypeUtilities.h"
@@ -36,23 +38,39 @@ static constexpr const char kAsyncFnPrefix[] = "async_execute_fn";
 static constexpr const char *kAddRef = "mlirAsyncRuntimeAddRef";
 static constexpr const char *kDropRef = "mlirAsyncRuntimeDropRef";
 static constexpr const char *kCreateToken = "mlirAsyncRuntimeCreateToken";
+static constexpr const char *kCreateValue = "mlirAsyncRuntimeCreateValue";
 static constexpr const char *kCreateGroup = "mlirAsyncRuntimeCreateGroup";
 static constexpr const char *kEmplaceToken = "mlirAsyncRuntimeEmplaceToken";
+static constexpr const char *kEmplaceValue = "mlirAsyncRuntimeEmplaceValue";
 static constexpr const char *kAwaitToken = "mlirAsyncRuntimeAwaitToken";
+static constexpr const char *kAwaitValue = "mlirAsyncRuntimeAwaitValue";
 static constexpr const char *kAwaitGroup = "mlirAsyncRuntimeAwaitAllInGroup";
 static constexpr const char *kExecute = "mlirAsyncRuntimeExecute";
+static constexpr const char *kGetValueStorage =
+    "mlirAsyncRuntimeGetValueStorage";
 static constexpr const char *kAddTokenToGroup =
     "mlirAsyncRuntimeAddTokenToGroup";
-static constexpr const char *kAwaitAndExecute =
+static constexpr const char *kAwaitTokenAndExecute =
     "mlirAsyncRuntimeAwaitTokenAndExecute";
+static constexpr const char *kAwaitValueAndExecute =
+    "mlirAsyncRuntimeAwaitValueAndExecute";
 static constexpr const char *kAwaitAllAndExecute =
     "mlirAsyncRuntimeAwaitAllInGroupAndExecute";
 
 namespace {
-// Async Runtime API function types.
+/// Async Runtime API function types.
+///
+/// Because we can't create API function signature for type parametrized
+/// async.value type, we use opaque pointers (!llvm.ptr<i8>) instead. After
+/// lowering all async data types become opaque pointers at runtime.
 struct AsyncAPI {
+  // All async types are lowered to opaque i8* LLVM pointers at runtime.
+  static LLVM::LLVMPointerType opaquePointerType(MLIRContext *ctx) {
+    return LLVM::LLVMPointerType::get(LLVM::LLVMIntegerType::get(ctx, 8));
+  }
+
   static FunctionType addOrDropRefFunctionType(MLIRContext *ctx) {
-    auto ref = LLVM::LLVMPointerType::get(LLVM::LLVMIntegerType::get(ctx, 8));
+    auto ref = opaquePointerType(ctx);
     auto count = IntegerType::get(ctx, 32);
     return FunctionType::get(ctx, {ref, count}, {});
   }
@@ -61,24 +79,46 @@ struct AsyncAPI {
     return FunctionType::get(ctx, {}, {TokenType::get(ctx)});
   }
 
+  static FunctionType createValueFunctionType(MLIRContext *ctx) {
+    auto i32 = IntegerType::get(ctx, 32);
+    auto value = opaquePointerType(ctx);
+    return FunctionType::get(ctx, {i32}, {value});
+  }
+
   static FunctionType createGroupFunctionType(MLIRContext *ctx) {
     return FunctionType::get(ctx, {}, {GroupType::get(ctx)});
   }
 
+  static FunctionType getValueStorageFunctionType(MLIRContext *ctx) {
+    auto value = opaquePointerType(ctx);
+    auto storage = opaquePointerType(ctx);
+    return FunctionType::get(ctx, {value}, {storage});
+  }
+
   static FunctionType emplaceTokenFunctionType(MLIRContext *ctx) {
     return FunctionType::get(ctx, {TokenType::get(ctx)}, {});
   }
 
+  static FunctionType emplaceValueFunctionType(MLIRContext *ctx) {
+    auto value = opaquePointerType(ctx);
+    return FunctionType::get(ctx, {value}, {});
+  }
+
   static FunctionType awaitTokenFunctionType(MLIRContext *ctx) {
     return FunctionType::get(ctx, {TokenType::get(ctx)}, {});
   }
 
+  static FunctionType awaitValueFunctionType(MLIRContext *ctx) {
+    auto value = opaquePointerType(ctx);
+    return FunctionType::get(ctx, {value}, {});
+  }
+
   static FunctionType awaitGroupFunctionType(MLIRContext *ctx) {
     return FunctionType::get(ctx, {GroupType::get(ctx)}, {});
   }
 
   static FunctionType executeFunctionType(MLIRContext *ctx) {
-    auto hdl = LLVM::LLVMPointerType::get(LLVM::LLVMIntegerType::get(ctx, 8));
+    auto hdl = opaquePointerType(ctx);
     auto resume = LLVM::LLVMPointerType::get(resumeFunctionType(ctx));
     return FunctionType::get(ctx, {hdl, resume}, {});
   }
@@ -89,14 +129,21 @@ struct AsyncAPI {
                              {i64});
   }
 
-  static FunctionType awaitAndExecuteFunctionType(MLIRContext *ctx) {
-    auto hdl = LLVM::LLVMPointerType::get(LLVM::LLVMIntegerType::get(ctx, 8));
+  static FunctionType awaitTokenAndExecuteFunctionType(MLIRContext *ctx) {
+    auto hdl = opaquePointerType(ctx);
     auto resume = LLVM::LLVMPointerType::get(resumeFunctionType(ctx));
     return FunctionType::get(ctx, {TokenType::get(ctx), hdl, resume}, {});
   }
 
+  static FunctionType awaitValueAndExecuteFunctionType(MLIRContext *ctx) {
+    auto value = opaquePointerType(ctx);
+    auto hdl = opaquePointerType(ctx);
+    auto resume = LLVM::LLVMPointerType::get(resumeFunctionType(ctx));
+    return FunctionType::get(ctx, {value, hdl, resume}, {});
+  }
+
   static FunctionType awaitAllAndExecuteFunctionType(MLIRContext *ctx) {
-    auto hdl = LLVM::LLVMPointerType::get(LLVM::LLVMIntegerType::get(ctx, 8));
+    auto hdl = opaquePointerType(ctx);
     auto resume = LLVM::LLVMPointerType::get(resumeFunctionType(ctx));
     return FunctionType::get(ctx, {GroupType::get(ctx), hdl, resume}, {});
   }
@@ -104,13 +151,13 @@ struct AsyncAPI {
   // Auxiliary coroutine resume intrinsic wrapper.
   static LLVM::LLVMType resumeFunctionType(MLIRContext *ctx) {
     auto voidTy = LLVM::LLVMVoidType::get(ctx);
-    auto i8Ptr = LLVM::LLVMPointerType::get(LLVM::LLVMIntegerType::get(ctx, 8));
+    auto i8Ptr = opaquePointerType(ctx);
     return LLVM::LLVMFunctionType::get(voidTy, {i8Ptr}, false);
   }
 };
 } // namespace
 
-// Adds Async Runtime C API declarations to the module.
+/// Adds Async Runtime C API declarations to the module.
 static void addAsyncRuntimeApiDeclarations(ModuleOp module) {
   auto builder = ImplicitLocOpBuilder::atBlockTerminator(module.getLoc(),
                                                          module.getBody());
@@ -125,13 +172,20 @@ static void addAsyncRuntimeApiDeclarations(ModuleOp module) {
   addFuncDecl(kAddRef, AsyncAPI::addOrDropRefFunctionType(ctx));
   addFuncDecl(kDropRef, AsyncAPI::addOrDropRefFunctionType(ctx));
   addFuncDecl(kCreateToken, AsyncAPI::createTokenFunctionType(ctx));
+  addFuncDecl(kCreateValue, AsyncAPI::createValueFunctionType(ctx));
   addFuncDecl(kCreateGroup, AsyncAPI::createGroupFunctionType(ctx));
   addFuncDecl(kEmplaceToken, AsyncAPI::emplaceTokenFunctionType(ctx));
+  addFuncDecl(kEmplaceValue, AsyncAPI::emplaceValueFunctionType(ctx));
   addFuncDecl(kAwaitToken, AsyncAPI::awaitTokenFunctionType(ctx));
+  addFuncDecl(kAwaitValue, AsyncAPI::awaitValueFunctionType(ctx));
   addFuncDecl(kAwaitGroup, AsyncAPI::awaitGroupFunctionType(ctx));
   addFuncDecl(kExecute, AsyncAPI::executeFunctionType(ctx));
+  addFuncDecl(kGetValueStorage, AsyncAPI::getValueStorageFunctionType(ctx));
   addFuncDecl(kAddTokenToGroup, AsyncAPI::addTokenToGroupFunctionType(ctx));
-  addFuncDecl(kAwaitAndExecute, AsyncAPI::awaitAndExecuteFunctionType(ctx));
+  addFuncDecl(kAwaitTokenAndExecute,
+              AsyncAPI::awaitTokenAndExecuteFunctionType(ctx));
+  addFuncDecl(kAwaitValueAndExecute,
+              AsyncAPI::awaitValueAndExecuteFunctionType(ctx));
   addFuncDecl(kAwaitAllAndExecute,
               AsyncAPI::awaitAllAndExecuteFunctionType(ctx));
 }
@@ -215,9 +269,9 @@ static void addCRuntimeDeclarations(ModuleOp module) {
 
 static constexpr const char *kResume = "__resume";
 
-// A function that takes a coroutine handle and calls a `llvm.coro.resume`
-// intrinsics. We need this function to be able to pass it to the async
-// runtime execute API.
+/// A function that takes a coroutine handle and calls a `llvm.coro.resume`
+/// intrinsics. We need this function to be able to pass it to the async
+/// runtime execute API.
 static void addResumeFunction(ModuleOp module) {
   MLIRContext *ctx = module.getContext();
 
@@ -248,49 +302,61 @@ static void addResumeFunction(ModuleOp module) {
 // async.execute op outlining to the coroutine functions.
 //===----------------------------------------------------------------------===//
 
-// Function targeted for coroutine transformation has two additional blocks at
-// the end: coroutine cleanup and coroutine suspension.
-//
-// async.await op lowering additionaly creates a resume block for each
-// operation to enable non-blocking waiting via coroutine suspension.
+/// Function targeted for coroutine transformation has two additional blocks at
+/// the end: coroutine cleanup and coroutine suspension.
+///
+/// async.await op lowering additionaly creates a resume block for each
+/// operation to enable non-blocking waiting via coroutine suspension.
 namespace {
 struct CoroMachinery {
-  Value asyncToken;
+  // Async execute region returns a completion token, and an async value for
+  // each yielded value.
+  //
+  //   %token, %result = async.execute -> !async.value<T> {
+  //     %0 = constant ... : T
+  //     async.yield %0 : T
+  //   }
+  Value asyncToken; // token representing completion of the async region
+  llvm::SmallVector<Value, 4> returnValues; // returned async values
+
   Value coroHandle;
   Block *cleanup;
   Block *suspend;
 };
 } // namespace
 
-// Builds an coroutine template compatible with LLVM coroutines lowering.
-//
-//  - `entry` block sets up the coroutine.
-//  - `cleanup` block cleans up the coroutine state.
-//  - `suspend block after the @llvm.coro.end() defines what value will be
-//    returned to the initial caller of a coroutine. Everything before the
-//    @llvm.coro.end() will be executed at every suspension point.
-//
-// Coroutine structure (only the important bits):
-//
-//   func @async_execute_fn(<function-arguments>) -> !async.token {
-//     ^entryBlock(<function-arguments>):
-//       %token = <async token> : !async.token // create async runtime token
-//       %hdl = llvm.call @llvm.coro.id(...)   // create a coroutine handle
-//       br ^cleanup
-//
-//     ^cleanup:
-//       llvm.call @llvm.coro.free(...)        // delete coroutine state
-//       br ^suspend
-//
-//     ^suspend:
-//       llvm.call @llvm.coro.end(...)         // marks the end of a coroutine
-//       return %token : !async.token
-//   }
-//
-// The actual code for the async.execute operation body region will be inserted
-// before the entry block terminator.
-//
-//
+/// Builds an coroutine template compatible with LLVM coroutines lowering.
+///
+///  - `entry` block sets up the coroutine.
+///  - `cleanup` block cleans up the coroutine state.
+///  - `suspend block after the @llvm.coro.end() defines what value will be
+///    returned to the initial caller of a coroutine. Everything before the
+///    @llvm.coro.end() will be executed at every suspension point.
+///
+/// Coroutine structure (only the important bits):
+///
+///   func @async_execute_fn(<function-arguments>)
+///        -> (!async.token, !async.value<T>)
+///   {
+///     ^entryBlock(<function-arguments>):
+///       %token = <async token> : !async.token    // create async runtime token
+///       %value = <async value> : !async.value<T> // create async value
+///       %hdl = llvm.call @llvm.coro.id(...)      // create a coroutine handle
+///       br ^cleanup
+///
+///     ^cleanup:
+///       llvm.call @llvm.coro.free(...)  // delete coroutine state
+///       br ^suspend
+///
+///     ^suspend:
+///       llvm.call @llvm.coro.end(...)  // marks the end of a coroutine
+///       return %token, %value : !async.token, !async.value<T>
+///   }
+///
+/// The actual code for the async.execute operation body region will be inserted
+/// before the entry block terminator.
+///
+///
 static CoroMachinery setupCoroMachinery(FuncOp func) {
   assert(func.getBody().empty() && "Function must have empty body");
 
@@ -312,6 +378,44 @@ static CoroMachinery setupCoroMachinery(FuncOp func) {
   // ------------------------------------------------------------------------ //
   auto createToken = builder.create<CallOp>(kCreateToken, TokenType::get(ctx));
 
+  // Async value operands and results must be convertible to LLVM types. This is
+  // verified before the function outlining.
+  LLVMTypeConverter converter(ctx);
+
+  // Returns the size requirements for the async value storage.
+  // http://nondot.org/sabre/LLVMNotes/SizeOf-OffsetOf-VariableSizedStructs.txt
+  auto sizeOf = [&](ValueType valueType) -> Value {
+    auto storedType = converter.convertType(valueType.getValueType());
+    auto storagePtrType =
+        LLVM::LLVMPointerType::get(storedType.cast<LLVM::LLVMType>());
+
+    // %Size = getelementptr %T* null, int 1
+    // %SizeI = ptrtoint %T* %Size to i32
+    auto nullPtr = builder.create<LLVM::NullOp>(loc, storagePtrType);
+    auto one = builder.create<LLVM::ConstantOp>(loc, i32,
+                                                builder.getI32IntegerAttr(1));
+    auto gep = builder.create<LLVM::GEPOp>(loc, storagePtrType, nullPtr,
+                                           one.getResult());
+    auto size = builder.create<LLVM::PtrToIntOp>(loc, i32, gep);
+
+    // Cast to std type because runtime API defined using std types.
+    return builder.create<LLVM::DialectCastOp>(loc, builder.getI32Type(),
+                                               size.getResult());
+  };
+
+  // We use the `async.value` type as a return type although it does not match
+  // the `kCreateValue` function signature, because it will be later lowered to
+  // the runtime type (opaque i8* pointer).
+  llvm::SmallVector<CallOp, 4> createValues;
+  for (auto resultType : func.getCallableResults().drop_front(1))
+    createValues.emplace_back(builder.create<CallOp>(
+        loc, kCreateValue, resultType, sizeOf(resultType.cast<ValueType>())));
+
+  auto createdValues = llvm::map_range(
+      createValues, [](CallOp call) { return call.getResult(0); });
+  llvm::SmallVector<Value, 4> returnValues(createdValues.begin(),
+                                           createdValues.end());
+
   // ------------------------------------------------------------------------ //
   // Initialize coroutine: allocate frame, get coroutine handle.
   // ------------------------------------------------------------------------ //
@@ -371,9 +475,11 @@ static CoroMachinery setupCoroMachinery(FuncOp func) {
   builder.create<LLVM::CallOp>(i1, builder.getSymbolRefAttr(kCoroEnd),
                                ValueRange({coroHdl.getResult(0), constFalse}));
 
-  // Return created `async.token` from the suspend block. This will be the
-  // return value of a coroutine ramp function.
-  builder.create<ReturnOp>(createToken.getResult(0));
+  // Return created `async.token` and `async.values` from the suspend block.
+  // This will be the return value of a coroutine ramp function.
+  SmallVector<Value, 4> ret{createToken.getResult(0)};
+  ret.insert(ret.end(), returnValues.begin(), returnValues.end());
+  builder.create<ReturnOp>(loc, ret);
 
   // Branch from the entry block to the cleanup block to create a valid CFG.
   builder.setInsertionPointToEnd(entryBlock);
@@ -383,39 +489,44 @@ static CoroMachinery setupCoroMachinery(FuncOp func) {
   // `async.await` op lowering will create resume blocks for async
   // continuations, and will conditionally branch to cleanup or suspend blocks.
 
-  return {createToken.getResult(0), coroHdl.getResult(0), cleanupBlock,
-          suspendBlock};
+  CoroMachinery machinery;
+  machinery.asyncToken = createToken.getResult(0);
+  machinery.returnValues = returnValues;
+  machinery.coroHandle = coroHdl.getResult(0);
+  machinery.cleanup = cleanupBlock;
+  machinery.suspend = suspendBlock;
+  return machinery;
 }
 
-// Add a LLVM coroutine suspension point to the end of suspended block, to
-// resume execution in resume block. The caller is responsible for creating the
-// two suspended/resume blocks with the desired ops contained in each block.
-// This function merely provides the required control flow logic.
-//
-// `coroState` must be a value returned from the call to @llvm.coro.save(...)
-// intrinsic (saved coroutine state).
-//
-// Before:
-//
-//   ^bb0:
-//     "opBefore"(...)
-//     "op"(...)
-//   ^cleanup: ...
-//   ^suspend: ...
-//   ^resume:
-//     "op"(...)
-//
-// After:
-//
-//   ^bb0:
-//     "opBefore"(...)
-//     %suspend = llmv.call @llvm.coro.suspend(...)
-//     switch %suspend [-1: ^suspend, 0: ^resume, 1: ^cleanup]
-//   ^resume:
-//     "op"(...)
-//   ^cleanup: ...
-//   ^suspend: ...
-//
+/// Add a LLVM coroutine suspension point to the end of suspended block, to
+/// resume execution in resume block. The caller is responsible for creating the
+/// two suspended/resume blocks with the desired ops contained in each block.
+/// This function merely provides the required control flow logic.
+///
+/// `coroState` must be a value returned from the call to @llvm.coro.save(...)
+/// intrinsic (saved coroutine state).
+///
+/// Before:
+///
+///   ^bb0:
+///     "opBefore"(...)
+///     "op"(...)
+///   ^cleanup: ...
+///   ^suspend: ...
+///   ^resume:
+///     "op"(...)
+///
+/// After:
+///
+///   ^bb0:
+///     "opBefore"(...)
+///     %suspend = llmv.call @llvm.coro.suspend(...)
+///     switch %suspend [-1: ^suspend, 0: ^resume, 1: ^cleanup]
+///   ^resume:
+///     "op"(...)
+///   ^cleanup: ...
+///   ^suspend: ...
+///
 static void addSuspensionPoint(CoroMachinery coro, Value coroState,
                                Operation *op, Block *suspended, Block *resume,
                                OpBuilder &builder) {
@@ -461,10 +572,10 @@ static void addSuspensionPoint(CoroMachinery coro, Value coroState,
                                  /*falseDest=*/coro.cleanup);
 }
 
-// Outline the body region attached to the `async.execute` op into a standalone
-// function.
-//
-// Note that this is not reversible transformation.
+/// Outline the body region attached to the `async.execute` op into a standalone
+/// function.
+///
+/// Note that this is not reversible transformation.
 static std::pair<FuncOp, CoroMachinery>
 outlineExecuteOp(SymbolTable &symbolTable, ExecuteOp execute) {
   ModuleOp module = execute->getParentOfType<ModuleOp>();
@@ -475,6 +586,7 @@ outlineExecuteOp(SymbolTable &symbolTable, ExecuteOp execute) {
   // Collect all outlined function inputs.
   llvm::SetVector<mlir::Value> functionInputs(execute.dependencies().begin(),
                                               execute.dependencies().end());
+  assert(execute.operands().empty() && "operands are not supported");
   getUsedValuesDefinedAbove(execute.body(), functionInputs);
 
   // Collect types for the outlined function inputs and outputs.
@@ -535,15 +647,9 @@ outlineExecuteOp(SymbolTable &symbolTable, ExecuteOp execute) {
   valueMapping.map(functionInputs, func.getArguments());
 
   // Clone all operations from the execute operation body into the outlined
-  // function body, and replace all `async.yield` operations with a call
-  // to async runtime to emplace the result token.
-  for (Operation &op : execute.body().getOps()) {
-    if (isa<async::YieldOp>(op)) {
-      builder.create<CallOp>(kEmplaceToken, TypeRange(), coro.asyncToken);
-      continue;
-    }
+  // function body.
+  for (Operation &op : execute.body().getOps())
     builder.clone(op, valueMapping);
-  }
 
   // Replace the original `async.execute` with a call to outlined function.
   ImplicitLocOpBuilder callBuilder(loc, execute);
@@ -560,42 +666,38 @@ outlineExecuteOp(SymbolTable &symbolTable, ExecuteOp execute) {
 //===----------------------------------------------------------------------===//
 
 namespace {
+
+/// AsyncRuntimeTypeConverter only converts types from the Async dialect to
+/// their runtime type (opaque pointers) and does not convert any other types.
 class AsyncRuntimeTypeConverter : public TypeConverter {
 public:
-  AsyncRuntimeTypeConverter() { addConversion(convertType); }
+  AsyncRuntimeTypeConverter() {
+    addConversion([](Type type) { return type; });
+    addConversion(convertAsyncTypes);
+  }
 
-  static Type convertType(Type type) {
-    MLIRContext *ctx = type.getContext();
-    // Convert async tokens and groups to opaque pointers.
-    if (type.isa<TokenType, GroupType>())
-      return LLVM::LLVMPointerType::get(LLVM::LLVMIntegerType::get(ctx, 8));
-    return type;
+  static Optional<Type> convertAsyncTypes(Type type) {
+    if (type.isa<TokenType, GroupType, ValueType>())
+      return AsyncAPI::opaquePointerType(type.getContext());
+    return llvm::None;
   }
 };
 } // namespace
 
 //===----------------------------------------------------------------------===//
-// Convert types for all call operations to lowered async types.
+// Convert return operations that return async values from async regions.
 //===----------------------------------------------------------------------===//
 
 namespace {
-class CallOpOpConversion : public ConversionPattern {
+class ReturnOpOpConversion : public ConversionPattern {
 public:
-  explicit CallOpOpConversion(MLIRContext *ctx)
-      : ConversionPattern(CallOp::getOperationName(), 1, ctx) {}
+  explicit ReturnOpOpConversion(TypeConverter &converter, MLIRContext *ctx)
+      : ConversionPattern(ReturnOp::getOperationName(), 1, converter, ctx) {}
 
   LogicalResult
   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override {
-    AsyncRuntimeTypeConverter converter;
-
-    SmallVector<Type, 5> resultTypes;
-    converter.convertTypes(op->getResultTypes(), resultTypes);
-
-    CallOp call = cast<CallOp>(op);
-    rewriter.replaceOpWithNewOp<CallOp>(op, resultTypes, call.callee(),
-                                        operands);
-
+    rewriter.replaceOpWithNewOp<ReturnOp>(op, operands);
     return success();
   }
 };
@@ -611,8 +713,9 @@ namespace {
 template <typename RefCountingOp>
 class RefCountingOpLowering : public ConversionPattern {
 public:
-  explicit RefCountingOpLowering(MLIRContext *ctx, StringRef apiFunctionName)
-      : ConversionPattern(RefCountingOp::getOperationName(), 1, ctx),
+  explicit RefCountingOpLowering(TypeConverter &converter, MLIRContext *ctx,
+                                 StringRef apiFunctionName)
+      : ConversionPattern(RefCountingOp::getOperationName(), 1, converter, ctx),
         apiFunctionName(apiFunctionName) {}
 
   LogicalResult
@@ -634,18 +737,18 @@ private:
   StringRef apiFunctionName;
 };
 
-// async.drop_ref op lowering to mlirAsyncRuntimeDropRef function call.
+/// async.drop_ref op lowering to mlirAsyncRuntimeDropRef function call.
 class AddRefOpLowering : public RefCountingOpLowering<AddRefOp> {
 public:
-  explicit AddRefOpLowering(MLIRContext *ctx)
-      : RefCountingOpLowering(ctx, kAddRef) {}
+  explicit AddRefOpLowering(TypeConverter &converter, MLIRContext *ctx)
+      : RefCountingOpLowering(converter, ctx, kAddRef) {}
 };
 
-// async.create_group op lowering to mlirAsyncRuntimeCreateGroup function call.
+/// async.create_group op lowering to mlirAsyncRuntimeCreateGroup function call.
 class DropRefOpLowering : public RefCountingOpLowering<DropRefOp> {
 public:
-  explicit DropRefOpLowering(MLIRContext *ctx)
-      : RefCountingOpLowering(ctx, kDropRef) {}
+  explicit DropRefOpLowering(TypeConverter &converter, MLIRContext *ctx)
+      : RefCountingOpLowering(converter, ctx, kDropRef) {}
 };
 
 } // namespace
@@ -657,8 +760,9 @@ public:
 namespace {
 class CreateGroupOpLowering : public ConversionPattern {
 public:
-  explicit CreateGroupOpLowering(MLIRContext *ctx)
-      : ConversionPattern(CreateGroupOp::getOperationName(), 1, ctx) {}
+  explicit CreateGroupOpLowering(TypeConverter &converter, MLIRContext *ctx)
+      : ConversionPattern(CreateGroupOp::getOperationName(), 1, converter,
+                          ctx) {}
 
   LogicalResult
   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
@@ -677,8 +781,9 @@ public:
 namespace {
 class AddToGroupOpLowering : public ConversionPattern {
 public:
-  explicit AddToGroupOpLowering(MLIRContext *ctx)
-      : ConversionPattern(AddToGroupOp::getOperationName(), 1, ctx) {}
+  explicit AddToGroupOpLowering(TypeConverter &converter, MLIRContext *ctx)
+      : ConversionPattern(AddToGroupOp::getOperationName(), 1, converter, ctx) {
+  }
 
   LogicalResult
   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
@@ -706,10 +811,10 @@ template <typename AwaitType, typename AwaitableType>
 class AwaitOpLoweringBase : public ConversionPattern {
 protected:
   explicit AwaitOpLoweringBase(
-      MLIRContext *ctx,
+      TypeConverter &converter, MLIRContext *ctx,
       const llvm::DenseMap<FuncOp, CoroMachinery> &outlinedFunctions,
       StringRef blockingAwaitFuncName, StringRef coroAwaitFuncName)
-      : ConversionPattern(AwaitType::getOperationName(), 1, ctx),
+      : ConversionPattern(AwaitType::getOperationName(), 1, converter, ctx),
         outlinedFunctions(outlinedFunctions),
         blockingAwaitFuncName(blockingAwaitFuncName),
         coroAwaitFuncName(coroAwaitFuncName) {}
@@ -719,7 +824,7 @@ public:
   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override {
     // We can only await on one the `AwaitableType` (for `await` it can be
-    // only a `token`, for `await_all` it is a `group`).
+    // a `token` or a `value`, for `await_all` it must be a `group`).
     auto await = cast<AwaitType>(op);
     if (!await.operand().getType().template isa<AwaitableType>())
       return failure();
@@ -768,44 +873,163 @@ public:
       Block *resume = rewriter.splitBlock(suspended, Block::iterator(op));
       addSuspensionPoint(coro, coroSave.getResult(0), op, suspended, resume,
                          builder);
+
+      // Make sure that replacement value will be constructed in resume block.
+      rewriter.setInsertionPointToStart(resume);
     }
 
-    // Original operation was replaced by function call or suspension point.
-    rewriter.eraseOp(op);
+    // Replace or erase the await operation with the new value.
+    if (Value replaceWith = getReplacementValue(op, operands[0], rewriter))
+      rewriter.replaceOp(op, replaceWith);
+    else
+      rewriter.eraseOp(op);
 
     return success();
   }
 
+  virtual Value getReplacementValue(Operation *op, Value operand,
+                                    ConversionPatternRewriter &rewriter) const {
+    return Value();
+  }
+
 private:
   const llvm::DenseMap<FuncOp, CoroMachinery> &outlinedFunctions;
   StringRef blockingAwaitFuncName;
   StringRef coroAwaitFuncName;
 };
 
-// Lowering for `async.await` operation (only token operands are supported).
-class AwaitOpLowering : public AwaitOpLoweringBase<AwaitOp, TokenType> {
+/// Lowering for `async.await` with a token operand.
+class AwaitTokenOpLowering : public AwaitOpLoweringBase<AwaitOp, TokenType> {
   using Base = AwaitOpLoweringBase<AwaitOp, TokenType>;
 
 public:
-  explicit AwaitOpLowering(
-      MLIRContext *ctx,
+  explicit AwaitTokenOpLowering(
+      TypeConverter &converter, MLIRContext *ctx,
       const llvm::DenseMap<FuncOp, CoroMachinery> &outlinedFunctions)
-      : Base(ctx, outlinedFunctions, kAwaitToken, kAwaitAndExecute) {}
+      : Base(converter, ctx, outlinedFunctions, kAwaitToken,
+             kAwaitTokenAndExecute) {}
 };
 
-// Lowering for `async.await_all` operation.
+/// Lowering for `async.await` with a value operand.
+class AwaitValueOpLowering : public AwaitOpLoweringBase<AwaitOp, ValueType> {
+  using Base = AwaitOpLoweringBase<AwaitOp, ValueType>;
+
+public:
+  explicit AwaitValueOpLowering(
+      TypeConverter &converter, MLIRContext *ctx,
+      const llvm::DenseMap<FuncOp, CoroMachinery> &outlinedFunctions)
+      : Base(converter, ctx, outlinedFunctions, kAwaitValue,
+             kAwaitValueAndExecute) {}
+
+  Value
+  getReplacementValue(Operation *op, Value operand,
+                      ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
+    auto i8Ptr = AsyncAPI::opaquePointerType(rewriter.getContext());
+
+    // Get the underlying value type from the `async.value`.
+    auto await = cast<AwaitOp>(op);
+    auto valueType = await.operand().getType().cast<ValueType>().getValueType();
+
+    // Get a pointer to an async value storage from the runtime.
+    auto storage = rewriter.create<CallOp>(loc, kGetValueStorage,
+                                           TypeRange(i8Ptr), operand);
+
+    // Cast from i8* to the pointer pointer to LLVM type.
+    auto llvmValueType = getTypeConverter()->convertType(valueType);
+    auto castedStorage = rewriter.create<LLVM::BitcastOp>(
+        loc, LLVM::LLVMPointerType::get(llvmValueType.cast<LLVM::LLVMType>()),
+        storage.getResult(0));
+
+    // Load from the async value storage.
+    auto loaded = rewriter.create<LLVM::LoadOp>(loc, castedStorage.getResult());
+
+    // Cast from LLVM type to the expected value type. This cast will become
+    // no-op after lowering to LLVM.
+    return rewriter.create<LLVM::DialectCastOp>(loc, valueType, loaded);
+  }
+};
+
+/// Lowering for `async.await_all` operation.
 class AwaitAllOpLowering : public AwaitOpLoweringBase<AwaitAllOp, GroupType> {
   using Base = AwaitOpLoweringBase<AwaitAllOp, GroupType>;
 
 public:
   explicit AwaitAllOpLowering(
-      MLIRContext *ctx,
+      TypeConverter &converter, MLIRContext *ctx,
       const llvm::DenseMap<FuncOp, CoroMachinery> &outlinedFunctions)
-      : Base(ctx, outlinedFunctions, kAwaitGroup, kAwaitAllAndExecute) {}
+      : Base(converter, ctx, outlinedFunctions, kAwaitGroup,
+             kAwaitAllAndExecute) {}
 };
 
 } // namespace
 
+//===----------------------------------------------------------------------===//
+// async.yield op lowerings to the corresponding async runtime function calls.
+//===----------------------------------------------------------------------===//
+
+class YieldOpLowering : public ConversionPattern {
+public:
+  explicit YieldOpLowering(
+      TypeConverter &converter, MLIRContext *ctx,
+      const llvm::DenseMap<FuncOp, CoroMachinery> &outlinedFunctions)
+      : ConversionPattern(async::YieldOp::getOperationName(), 1, converter,
+                          ctx),
+        outlinedFunctions(outlinedFunctions) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Check if yield operation is inside the outlined coroutine function.
+    auto func = op->template getParentOfType<FuncOp>();
+    auto outlined = outlinedFunctions.find(func);
+    if (outlined == outlinedFunctions.end())
+      return op->emitOpError(
+          "async.yield is not inside the outlined coroutine function");
+
+    Location loc = op->getLoc();
+    const CoroMachinery &coro = outlined->getSecond();
+
+    // Store yielded values into the async values storage and emplace them.
+    auto i8Ptr = AsyncAPI::opaquePointerType(rewriter.getContext());
+
+    for (auto tuple : llvm::zip(operands, coro.returnValues)) {
+      // Store `yieldValue` into the `asyncValue` storage.
+      Value yieldValue = std::get<0>(tuple);
+      Value asyncValue = std::get<1>(tuple);
+
+      // Get an opaque i8* pointer to an async value storage from the runtime.
+      auto storage = rewriter.create<CallOp>(loc, kGetValueStorage,
+                                             TypeRange(i8Ptr), asyncValue);
+
+      // Cast storage pointer to the yielded value type.
+      auto castedStorage = rewriter.create<LLVM::BitcastOp>(
+          loc,
+          LLVM::LLVMPointerType::get(
+              yieldValue.getType().cast<LLVM::LLVMType>()),
+          storage.getResult(0));
+
+      // Store the yielded value into the async value storage.
+      rewriter.create<LLVM::StoreOp>(loc, yieldValue,
+                                     castedStorage.getResult());
+
+      // Emplace the `async.value` to mark it ready.
+      rewriter.create<CallOp>(loc, kEmplaceValue, TypeRange(), asyncValue);
+    }
+
+    // Emplace the completion token to mark it ready.
+    rewriter.create<CallOp>(loc, kEmplaceToken, TypeRange(), coro.asyncToken);
+
+    // Original operation was replaced by the function call(s).
+    rewriter.eraseOp(op);
+
+    return success();
+  }
+
+private:
+  const llvm::DenseMap<FuncOp, CoroMachinery> &outlinedFunctions;
+};
+
 //===----------------------------------------------------------------------===//
 
 namespace {
@@ -818,15 +1042,38 @@ void ConvertAsyncToLLVMPass::runOnOperation() {
   ModuleOp module = getOperation();
   SymbolTable symbolTable(module);
 
+  MLIRContext *ctx = &getContext();
+
   // Outline all `async.execute` body regions into async functions (coroutines).
   llvm::DenseMap<FuncOp, CoroMachinery> outlinedFunctions;
 
+  // We use conversion to LLVM type to ensure that all `async.value` operands
+  // and results can be lowered to LLVM load and store operations.
+  LLVMTypeConverter llvmConverter(ctx);
+  llvmConverter.addConversion(AsyncRuntimeTypeConverter::convertAsyncTypes);
+
+  // Returns true if the `async.value` payload is convertible to LLVM.
+  auto isConvertibleToLlvm = [&](Type type) -> bool {
+    auto valueType = type.cast<ValueType>().getValueType();
+    return static_cast<bool>(llvmConverter.convertType(valueType));
+  };
+
   WalkResult outlineResult = module.walk([&](ExecuteOp execute) {
+    // All operands and results must be convertible to LLVM.
+    if (!llvm::all_of(execute.operands().getTypes(), isConvertibleToLlvm)) {
+      execute.emitOpError("operands payload must be convertible to LLVM type");
+      return WalkResult::interrupt();
+    }
+    if (!llvm::all_of(execute.results().getTypes(), isConvertibleToLlvm)) {
+      execute.emitOpError("results payload must be convertible to LLVM type");
+      return WalkResult::interrupt();
+    }
+
     // We currently do not support execute operations that have async value
     // operands or produce async results.
-    if (!execute.operands().empty() || !execute.results().empty()) {
-      execute.emitOpError("can't outline async.execute op with async value "
-                          "operands or returned async results");
+    if (!execute.operands().empty()) {
+      execute.emitOpError(
+          "can't outline async.execute op with async value operands");
       return WalkResult::interrupt();
     }
 
@@ -852,26 +1099,44 @@ void ConvertAsyncToLLVMPass::runOnOperation() {
   addCoroutineIntrinsicsDeclarations(module);
   addCRuntimeDeclarations(module);
 
-  MLIRContext *ctx = &getContext();
-
   // Convert async dialect types and operations to LLVM dialect.
   AsyncRuntimeTypeConverter converter;
   OwningRewritePatternList patterns;
 
+  // Convert async types in function signatures and function calls.
   populateFuncOpTypeConversionPattern(patterns, ctx, converter);
-  patterns.insert<CallOpOpConversion>(ctx);
-  patterns.insert<AddRefOpLowering, DropRefOpLowering>(ctx);
-  patterns.insert<CreateGroupOpLowering, AddToGroupOpLowering>(ctx);
-  patterns.insert<AwaitOpLowering, AwaitAllOpLowering>(ctx, outlinedFunctions);
+  populateCallOpTypeConversionPattern(patterns, ctx, converter);
+
+  // Convert return operations inside async.execute regions.
+  patterns.insert<ReturnOpOpConversion>(converter, ctx);
+
+  // Lower async operations to async runtime API calls.
+  patterns.insert<AddRefOpLowering, DropRefOpLowering>(converter, ctx);
+  patterns.insert<CreateGroupOpLowering, AddToGroupOpLowering>(converter, ctx);
+
+  // Use LLVM type converter to automatically convert between the async value
+  // payload type and LLVM type when loading/storing from/to the async
+  // value storage which is an opaque i8* pointer using LLVM load/store ops.
+  patterns
+      .insert<AwaitTokenOpLowering, AwaitValueOpLowering, AwaitAllOpLowering>(
+          llvmConverter, ctx, outlinedFunctions);
+  patterns.insert<YieldOpLowering>(llvmConverter, ctx, outlinedFunctions);
 
   ConversionTarget target(*ctx);
   target.addLegalOp<ConstantOp>();
   target.addLegalDialect<LLVM::LLVMDialect>();
+
+  // All operations from Async dialect must be lowered to the runtime API calls.
   target.addIllegalDialect<AsyncDialect>();
+
+  // Add dynamic legality constraints to apply conversions defined above.
   target.addDynamicallyLegalOp<FuncOp>(
       [&](FuncOp op) { return converter.isSignatureLegal(op.getType()); });
-  target.addDynamicallyLegalOp<CallOp>(
-      [&](CallOp op) { return converter.isLegal(op.getResultTypes()); });
+  target.addDynamicallyLegalOp<ReturnOp>(
+      [&](ReturnOp op) { return converter.isLegal(op.getOperandTypes()); });
+  target.addDynamicallyLegalOp<CallOp>([&](CallOp op) {
+    return converter.isSignatureLegal(op.getCalleeType());
+  });
 
   if (failed(applyPartialConversion(module, target, std::move(patterns))))
     signalPassFailure();
diff --git a/mlir/lib/Conversion/AsyncToLLVM/CMakeLists.txt b/mlir/lib/Conversion/AsyncToLLVM/CMakeLists.txt
index 005712c6cb48..48c2e0155ae1 100644
--- a/mlir/lib/Conversion/AsyncToLLVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/AsyncToLLVM/CMakeLists.txt
@@ -13,5 +13,7 @@ add_mlir_conversion_library(MLIRAsyncToLLVM
   LINK_LIBS PUBLIC
   MLIRAsync
   MLIRLLVMIR
+  MLIRStandardOpsTransforms
+  MLIRStandardToLLVM
   MLIRTransforms
   )
diff --git a/mlir/lib/ExecutionEngine/AsyncRuntime.cpp b/mlir/lib/ExecutionEngine/AsyncRuntime.cpp
index 3bfed86aa996..45bdcb3733b8 100644
--- a/mlir/lib/ExecutionEngine/AsyncRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/AsyncRuntime.cpp
@@ -114,6 +114,7 @@ static AsyncRuntime *getDefaultAsyncRuntimeInstance() {
   return runtime.get();
 }
 
+// Async token provides a mechanism to signal asynchronous operation completion.
 struct AsyncToken : public RefCounted {
   // AsyncToken created with a reference count of 2 because it will be returned
   // to the `async.execute` caller and also will be later on emplaced by the
@@ -130,6 +131,28 @@ struct AsyncToken : public RefCounted {
   std::vector<std::function<void()>> awaiters;
 };
 
+// Async value provides a mechanism to access the result of asynchronous
+// operations. It owns the storage that is used to store/load the value of the
+// underlying type, and a flag to signal if the value is ready or not.
+struct AsyncValue : public RefCounted {
+  // AsyncValue similar to an AsyncToken created with a reference count of 2.
+  AsyncValue(AsyncRuntime *runtime, int32_t size)
+      : RefCounted(runtime, /*count=*/2), storage(size) {}
+
+  // Internal state below guarded by a mutex.
+  std::mutex mu;
+  std::condition_variable cv;
+
+  bool ready = false;
+  std::vector<std::function<void()>> awaiters;
+
+  // Use vector of bytes to store async value payload.
+  std::vector<int8_t> storage;
+};
+
+// Async group provides a mechanism to group together multiple async tokens or
+// values to await on all of them together (wait for the completion of all
+// tokens or values added to the group).
 struct AsyncGroup : public RefCounted {
   AsyncGroup(AsyncRuntime *runtime)
       : RefCounted(runtime), pendingTokens(0), rank(0) {}
@@ -159,12 +182,18 @@ extern "C" void mlirAsyncRuntimeDropRef(RefCountedObjPtr ptr, int32_t count) {
   refCounted->dropRef(count);
 }
 
-// Create a new `async.token` in not-ready state.
+// Creates a new `async.token` in not-ready state.
 extern "C" AsyncToken *mlirAsyncRuntimeCreateToken() {
   AsyncToken *token = new AsyncToken(getDefaultAsyncRuntimeInstance());
   return token;
 }
 
+// Creates a new `async.value` in not-ready state.
+extern "C" AsyncValue *mlirAsyncRuntimeCreateValue(int32_t size) {
+  AsyncValue *value = new AsyncValue(getDefaultAsyncRuntimeInstance(), size);
+  return value;
+}
+
 // Create a new `async.group` in empty state.
 extern "C" AsyncGroup *mlirAsyncRuntimeCreateGroup() {
   AsyncGroup *group = new AsyncGroup(getDefaultAsyncRuntimeInstance());
@@ -228,18 +257,45 @@ extern "C" void mlirAsyncRuntimeEmplaceToken(AsyncToken *token) {
   token->dropRef();
 }
 
+// Switches `async.value` to ready state and runs all awaiters.
+extern "C" void mlirAsyncRuntimeEmplaceValue(AsyncValue *value) {
+  // Make sure that `dropRef` does not destroy the mutex owned by the lock.
+  {
+    std::unique_lock<std::mutex> lock(value->mu);
+    value->ready = true;
+    value->cv.notify_all();
+    for (auto &awaiter : value->awaiters)
+      awaiter();
+  }
+
+  // Async values created with a ref count `2` to keep value alive until the
+  // async task completes. Drop this reference explicitly when value emplaced.
+  value->dropRef();
+}
+
 extern "C" void mlirAsyncRuntimeAwaitToken(AsyncToken *token) {
   std::unique_lock<std::mutex> lock(token->mu);
   if (!token->ready)
     token->cv.wait(lock, [token] { return token->ready; });
 }
 
+extern "C" void mlirAsyncRuntimeAwaitValue(AsyncValue *value) {
+  std::unique_lock<std::mutex> lock(value->mu);
+  if (!value->ready)
+    value->cv.wait(lock, [value] { return value->ready; });
+}
+
 extern "C" void mlirAsyncRuntimeAwaitAllInGroup(AsyncGroup *group) {
   std::unique_lock<std::mutex> lock(group->mu);
   if (group->pendingTokens != 0)
     group->cv.wait(lock, [group] { return group->pendingTokens == 0; });
 }
 
+// Returns a pointer to the storage owned by the async value.
+extern "C" ValueStorage mlirAsyncRuntimeGetValueStorage(AsyncValue *value) {
+  return value->storage.data();
+}
+
 extern "C" void mlirAsyncRuntimeExecute(CoroHandle handle, CoroResume resume) {
   (*resume)(handle);
 }
@@ -255,6 +311,17 @@ extern "C" void mlirAsyncRuntimeAwaitTokenAndExecute(AsyncToken *token,
     token->awaiters.push_back([execute]() { execute(); });
 }
 
+extern "C" void mlirAsyncRuntimeAwaitValueAndExecute(AsyncValue *value,
+                                                     CoroHandle handle,
+                                                     CoroResume resume) {
+  std::unique_lock<std::mutex> lock(value->mu);
+  auto execute = [handle, resume]() { (*resume)(handle); };
+  if (value->ready)
+    execute();
+  else
+    value->awaiters.push_back([execute]() { execute(); });
+}
+
 extern "C" void mlirAsyncRuntimeAwaitAllInGroupAndExecute(AsyncGroup *group,
                                                           CoroHandle handle,
                                                           CoroResume resume) {
diff --git a/mlir/test/Conversion/AsyncToLLVM/convert-to-llvm.mlir b/mlir/test/Conversion/AsyncToLLVM/convert-to-llvm.mlir
index dadb28dbc082..dce0cf89628f 100644
--- a/mlir/test/Conversion/AsyncToLLVM/convert-to-llvm.mlir
+++ b/mlir/test/Conversion/AsyncToLLVM/convert-to-llvm.mlir
@@ -211,3 +211,44 @@ func @async_group_await_all(%arg0: f32, %arg1: memref<1xf32>) {
 
 // Emplace result token.
 // CHECK: call @mlirAsyncRuntimeEmplaceToken(%[[RET_1]])
+
+// -----
+
+// CHECK-LABEL: execute_and_return_f32
+func @execute_and_return_f32() -> f32 {
+ // CHECK: %[[RET:.*]]:2 = call @async_execute_fn
+  %token, %result = async.execute -> !async.value<f32> {
+    %c0 = constant 123.0 : f32
+    async.yield %c0 : f32
+  }
+
+  // CHECK: %[[STORAGE:.*]] = call @mlirAsyncRuntimeGetValueStorage(%[[RET]]#1)
+  // CHECK: %[[ST_F32:.*]] = llvm.bitcast %[[STORAGE]]
+  // CHECK: %[[LOADED:.*]] = llvm.load %[[ST_F32]] :  !llvm.ptr<float>
+  // CHECK: %[[CASTED:.*]] = llvm.mlir.cast %[[LOADED]] : !llvm.float to f32
+  %0 = async.await %result : !async.value<f32>
+
+  return %0 : f32
+}
+
+// Function outlined from the async.execute operation.
+// CHECK-LABEL: func private @async_execute_fn()
+// CHECK: %[[TOKEN:.*]] = call @mlirAsyncRuntimeCreateToken()
+// CHECK: %[[VALUE:.*]] = call @mlirAsyncRuntimeCreateValue
+// CHECK: %[[HDL:.*]] = llvm.call @llvm.coro.begin
+
+// Suspend coroutine in the beginning.
+// CHECK: call @mlirAsyncRuntimeExecute(%[[HDL]],
+// CHECK: llvm.call @llvm.coro.suspend
+
+// Emplace result value.
+// CHECK: %[[CST:.*]] = constant 1.230000e+02 : f32
+// CHECK: %[[LLVM_CST:.*]] = llvm.mlir.cast %[[CST]] : f32 to !llvm.float
+// CHECK: %[[STORAGE:.*]] = call @mlirAsyncRuntimeGetValueStorage(%[[VALUE]])
+// CHECK: %[[ST_F32:.*]] = llvm.bitcast %[[STORAGE]]
+// CHECK: llvm.store %[[LLVM_CST]], %[[ST_F32]] : !llvm.ptr<float>
+// CHECK: call @mlirAsyncRuntimeEmplaceValue(%[[VALUE]])
+
+// Emplace result token.
+// CHECK: call @mlirAsyncRuntimeEmplaceToken(%[[TOKEN]])
+
diff --git a/mlir/test/mlir-cpu-runner/async-value.mlir b/mlir/test/mlir-cpu-runner/async-value.mlir
new file mode 100644
index 000000000000..44b3b29e8491
--- /dev/null
+++ b/mlir/test/mlir-cpu-runner/async-value.mlir
@@ -0,0 +1,64 @@
+// RUN:   mlir-opt %s -async-ref-counting                                      \
+// RUN:               -convert-async-to-llvm                                   \
+// RUN:               -convert-vector-to-llvm                                  \
+// RUN:               -convert-std-to-llvm                                     \
+// RUN: | mlir-cpu-runner                                                      \
+// RUN:     -e main -entry-point-result=void -O0                               \
+// RUN:     -shared-libs=%linalg_test_lib_dir/libmlir_c_runner_utils%shlibext  \
+// RUN:     -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext    \
+// RUN:     -shared-libs=%linalg_test_lib_dir/libmlir_async_runtime%shlibext   \
+// RUN: | FileCheck %s --dump-input=always
+
+func @main() {
+
+  // ------------------------------------------------------------------------ //
+  // Blocking async.await outside of the async.execute.
+  // ------------------------------------------------------------------------ //
+  %token, %result = async.execute -> !async.value<f32> {
+    %0 = constant 123.456 : f32
+    async.yield %0 : f32
+  }
+  %1 = async.await %result : !async.value<f32>
+
+  // CHECK: 123.456
+  vector.print %1 : f32
+
+  // ------------------------------------------------------------------------ //
+  // Non-blocking async.await inside the async.execute
+  // ------------------------------------------------------------------------ //
+  %token0, %result0 = async.execute -> !async.value<f32> {
+    %token1, %result2 = async.execute -> !async.value<f32> {
+      %2 = constant 456.789 : f32
+      async.yield %2 : f32
+    }
+    %3 = async.await %result2 : !async.value<f32>
+    async.yield %3 : f32
+  }
+  %4 = async.await %result0 : !async.value<f32>
+
+  // CHECK: 456.789
+  vector.print %4 : f32
+
+  // ------------------------------------------------------------------------ //
+  // Memref allocated inside async.execute region.
+  // ------------------------------------------------------------------------ //
+  %token2, %result2 = async.execute[%token0] -> !async.value<memref<f32>> {
+    %5 = alloc() : memref<f32>
+    %c0 = constant 987.654 : f32
+    store %c0, %5[]: memref<f32>
+    async.yield %5 : memref<f32>
+  }
+  %6 = async.await %result2 : !async.value<memref<f32>>
+  %7 = memref_cast %6 :  memref<f32> to memref<*xf32>
+
+  // CHECK: Unranked Memref
+  // CHECK-SAME: rank = 0 offset = 0 sizes = [] strides = []
+  // CHECK-NEXT: [987.654]
+  call @print_memref_f32(%7): (memref<*xf32>) -> ()
+  dealloc %6 : memref<f32>
+
+  return
+}
+
+func private @print_memref_f32(memref<*xf32>)
+  attributes { llvm.emit_c_interface }