From 78c0a4423cfa5070082a405925420ed0d8f52484 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 4 Nov 2024 16:02:06 -0800
Subject: [PATCH 1/6] Add a few state-related cc ops

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Dialect/CC/CCOps.td  |  62 ++++++++++
 include/cudaq/Optimizer/Transforms/Passes.td |   5 +-
 lib/Frontend/nvqpp/ConvertExpr.cpp           |  13 +--
 lib/Optimizer/CodeGen/QuakeToCodegen.cpp     |  68 ++++++++++-
 lib/Optimizer/Transforms/DeleteStates.cpp    | 112 +++++++------------
 python/cudaq/kernel/ast_bridge.py            |   6 +-
 python/cudaq/kernel/kernel_builder.py        |  10 +-
 runtime/common/ArgumentConversion.cpp        |  23 +---
 runtime/cudaq/builder/kernel_builder.cpp     |  11 +-
 runtime/test/test_argument_conversion.cpp    |  20 +---
 test/AST-Quake/qalloc_state.cpp              |   9 +-
 test/Quake/delete_states.qke                 |  61 ++++------
 12 files changed, 218 insertions(+), 182 deletions(-)

diff --git a/include/cudaq/Optimizer/Dialect/CC/CCOps.td b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
index a58e3d403d..d58fc6bc33 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCOps.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
@@ -898,6 +898,68 @@ def cc_AddressOfOp : CCOp<"address_of", [Pure,
   }];
 }
 
+def cc_CreateStateOp : CCOp<"create_state", [Pure] > {
+  let summary = "Create state from data";
+  let description = [{
+    This operation takes a pointer to state data and the length of that data,
+    and creates a quantum state. It can be optimized away in the DeleteStates
+    pass, or replaced by an intrinsic runtime call on simulators.
+
+    ```mlir
+      %0 = cc.create_state %data, %len : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
+    ```
+  }];
+
+  let arguments = (ins
+    AnyPointerType:$data,
+    AnySignlessInteger:$length
+  );
+  let results = (outs AnyPointerType:$result);
+  let assemblyFormat = [{
+      $data `,` $length `:` functional-type(operands, results) attr-dict
+  }];
+}
+
+def cc_GetNumberOfQubitsOp : CCOp<"get_number_of_qubits", [Pure] > {
+  let summary = "Get number of qubits from a quantum state";
+  let description = [{
+    This operation takes a state pointer argument and returns the number of
+    qubits in the state. The operation can be optimized away in some passes
+    like ReplaceStateByKernel or DeleteStates, or replaced by an intrinsic
+    runtime call on simulators.
+
+    ```mlir
+      %0 = cc.get_number_of_qubits %state : (!cc.ptr<!cc.state>) -> i64
+    ```
+  }];
+
+  let arguments = (ins cc_PointerType:$state);
+  let results = (outs AnySignlessInteger:$result);
+  let assemblyFormat = [{
+      $state `:` functional-type(operands, results) attr-dict
+  }];
+}
+
+def cc_GetStateOp : CCOp<"get_state", [Pure] > {
+  let summary = "Get state from kernel with the provided name.";
+  let description = [{
+    This operation is created by argument synthesis of state pointer arguments
+    for quantum devices. It takes a kernel name as ASCIIZ string literal value
+    and returns the kernel's quantum state. The operation is replaced by a call
+    to the kernel with the provided name in ReplaceStateByKernel pass.
+
+    ```mlir
+      %0 = cc.get_state "callee" : !cc.ptr<!cc.state>
+    ```
+  }];
+
+  let arguments = (ins StrAttr:$calleeName); // Name of the kernel to query.
+  let results = (outs cc_PointerType:$result); // Pointer to the kernel's state.
+  let assemblyFormat = [{
+     $calleeName `:` qualified(type(results)) attr-dict
+  }];
+}
+
 def cc_GlobalOp : CCOp<"global", [IsolatedFromAbove, Symbol]> {
   let summary = "Create a global constant or variable";
   let description = [{
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index da6f3163b3..04964037c1 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -752,9 +752,8 @@ def DeleteStates : Pass<"delete-states", "mlir::ModuleOp"> {
     func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
       %c8_i64 = arith.constant 8 : i64
       %0 = cc.address_of @foo.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-      %3 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-      %4 = call @__nvqpp_cudaq_state_createFromData_fp32(%3, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-      %5 = call @__nvqpp_cudaq_state_numberOfQubits(%4) : (!cc.ptr<!cc.state>) -> i64
+      %4 = cc.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+      %5 = cc.get_number_of_qubits %4 : (!cc.ptr<!cc.state>) -> i64
       %6 = quake.alloca !quake.veq<?>[%5 : i64]
       %7 = quake.init_state %6, %4 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp
index e6350d1c5c..fa0fd326f1 100644
--- a/lib/Frontend/nvqpp/ConvertExpr.cpp
+++ b/lib/Frontend/nvqpp/ConvertExpr.cpp
@@ -2694,19 +2694,12 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
             initials = load.getPtrvalue();
         }
         if (isStateType(initials.getType())) {
-          IRBuilder irBuilder(builder.getContext());
-          auto mod =
-              builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
-          auto result =
-              irBuilder.loadIntrinsic(mod, getNumQubitsFromCudaqState);
-          assert(succeeded(result) && "loading intrinsic should never fail");
           Value state = initials;
           auto i64Ty = builder.getI64Type();
-          auto numQubits = builder.create<func::CallOp>(
-              loc, i64Ty, getNumQubitsFromCudaqState, ValueRange{state});
+          auto numQubits =
+              builder.create<cudaq::cc::GetNumberOfQubitsOp>(loc, i64Ty, state);
           auto veqTy = quake::VeqType::getUnsized(ctx);
-          Value alloc = builder.create<quake::AllocaOp>(loc, veqTy,
-                                                        numQubits.getResult(0));
+          Value alloc = builder.create<quake::AllocaOp>(loc, veqTy, numQubits);
           return pushValue(builder.create<quake::InitializeStateOp>(
               loc, veqTy, alloc, state));
         }
diff --git a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
index e9e56f8f5f..6e913a2bec 100644
--- a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
+++ b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
@@ -8,6 +8,9 @@
 
 #include "QuakeToCodegen.h"
 #include "CodeGenOps.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/CodeGen/Passes.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
@@ -62,10 +65,73 @@ class ExpandComplexCast : public OpRewritePattern<cudaq::cc::CastOp> {
     return success();
   }
 };
+
+/// Lower `cc.create_state` to a call to the FP32/FP64 create-state intrinsic.
+class CreateStateOpPattern : public OpRewritePattern<cudaq::cc::CreateStateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(cudaq::cc::CreateStateOp createStateOp,
+                                PatternRewriter &rewriter) const override {
+    auto module = createStateOp->getParentOfType<ModuleOp>();
+    auto loc = createStateOp.getLoc();
+    auto ctx = createStateOp.getContext();
+    auto buffer = createStateOp.getOperand(0);
+    auto size = createStateOp.getOperand(1);
+
+    // The data operand is expected to be a pointer to an array of elements.
+    auto ptrTy = cast<cudaq::cc::PointerType>(buffer.getType());
+    auto arrTy = cast<cudaq::cc::ArrayType>(ptrTy.getElementType());
+    auto eleTy = arrTy.getElementType();
+    auto is64Bit = isa<Float64Type>(eleTy);
+    // For complex data, the precision is that of its component type.
+    if (auto cTy = dyn_cast<ComplexType>(eleTy))
+      is64Bit = isa<Float64Type>(cTy.getElementType());
+
+    auto createStateFunc = is64Bit ? cudaq::createCudaqStateFromDataFP64
+                                   : cudaq::createCudaqStateFromDataFP32;
+    cudaq::IRBuilder irBuilder(ctx);
+    auto result = irBuilder.loadIntrinsic(module, createStateFunc);
+    assert(succeeded(result) && "loading intrinsic should never fail");
+    // The runtime intrinsic takes the data as an opaque `!cc.ptr<i8>`.
+    auto stateTy = cudaq::cc::StateType::get(ctx);
+    auto statePtrTy = cudaq::cc::PointerType::get(stateTy);
+    auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type());
+    auto cast = rewriter.create<cudaq::cc::CastOp>(loc, i8PtrTy, buffer);
+    rewriter.replaceOpWithNewOp<func::CallOp>(
+        createStateOp, statePtrTy, createStateFunc, ValueRange{cast, size});
+    return success();
+  }
+};
+
+/// Lower `cc.get_number_of_qubits` to a call to the runtime intrinsic.
+class GetNumberOfQubitsOpPattern
+    : public OpRewritePattern<cudaq::cc::GetNumberOfQubitsOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(cudaq::cc::GetNumberOfQubitsOp op,
+                                PatternRewriter &rewriter) const override {
+    // Ensure the intrinsic's declaration is present in the enclosing module.
+    auto intrinsic = cudaq::getNumQubitsFromCudaqState;
+    cudaq::IRBuilder irBuilder(op.getContext());
+    auto loaded =
+        irBuilder.loadIntrinsic(op->getParentOfType<ModuleOp>(), intrinsic);
+    assert(succeeded(loaded) && "loading intrinsic should never fail");
+
+    // Swap the op for a call that returns the qubit count as an `i64`.
+    Value state = op.getOperand();
+    rewriter.replaceOpWithNewOp<func::CallOp>(op, rewriter.getI64Type(),
+                                              intrinsic, state);
+    return success();
+  }
+};
+
 } // namespace
 
 void cudaq::codegen::populateQuakeToCodegenPatterns(
     mlir::RewritePatternSet &patterns) {
   auto *ctx = patterns.getContext();
-  patterns.insert<CodeGenRAIIPattern, ExpandComplexCast>(ctx);
+  patterns.insert<CodeGenRAIIPattern, ExpandComplexCast, CreateStateOpPattern,
+                  GetNumberOfQubitsOpPattern>(ctx);
 }
diff --git a/lib/Optimizer/Transforms/DeleteStates.cpp b/lib/Optimizer/Transforms/DeleteStates.cpp
index 7cc7bca044..74b3a432c2 100644
--- a/lib/Optimizer/Transforms/DeleteStates.cpp
+++ b/lib/Optimizer/Transforms/DeleteStates.cpp
@@ -29,104 +29,79 @@ namespace cudaq::opt {
 using namespace mlir;
 
 namespace {
-
-static bool isCall(Operation *callOp, std::vector<const char *> &&names) {
-  if (callOp) {
-    if (auto createStateCall = dyn_cast<func::CallOp>(callOp)) {
-      if (auto calleeAttr = createStateCall.getCalleeAttr()) {
-        auto funcName = calleeAttr.getValue().str();
-        if (std::find(names.begin(), names.end(), funcName) != names.end())
-          return true;
-      }
-    }
-  }
-  return false;
-}
-
-static bool isCreateStateCall(Operation *callOp) {
-  return isCall(callOp, {cudaq::createCudaqStateFromDataFP64,
-                         cudaq::createCudaqStateFromDataFP32});
-}
-
-static bool isNumberOfQubitsCall(Operation *callOp) {
-  return isCall(callOp, {cudaq::getNumQubitsFromCudaqState});
-}
-
-/// For a call to `__nvqpp_cudaq_state_createFromData_fpXX`, get the number of
-/// qubits allocated.
-static std::size_t getStateSize(Operation *callOp) {
-  if (isCreateStateCall(callOp)) {
-    if (auto createStateCall = dyn_cast<func::CallOp>(callOp)) {
-      auto sizeOperand = createStateCall.getOperand(1);
-      auto defOp = sizeOperand.getDefiningOp();
-      while (defOp && !dyn_cast<arith::ConstantIntOp>(defOp))
-        defOp = defOp->getOperand(0).getDefiningOp();
-      if (auto constOp = dyn_cast<arith::ConstantIntOp>(defOp))
-        return constOp.getValue().cast<IntegerAttr>().getInt();
-    }
+/// For a `cc.create_state` op, get the constant length of its state data.
+/// Emits an error and returns 0 when that length is not a known constant.
+static std::size_t getStateSize(Operation *op) {
+  if (auto createStateOp = dyn_cast<cudaq::cc::CreateStateOp>(op)) {
+    auto def = createStateOp.getOperand(1).getDefiningOp();
+    while (def && !isa<arith::ConstantIntOp>(def) && def->getNumOperands())
+      def = def->getOperand(0).getDefiningOp();
+    if (auto constOp = dyn_cast_or_null<arith::ConstantIntOp>(def))
+      return constOp.getValue().cast<IntegerAttr>().getInt();
   }
-  callOp->emitError("Cannot compute number of qubits");
+  op->emitError("Cannot compute number of qubits from createStateOp");
   return 0;
 }
 
 // clang-format off
-/// Remove `__nvqpp_cudaq_state_numberOfQubits` calls.
+/// Replace `cc.get_number_of_qubits` by a constant.
 /// ```
-/// %1 = arith.constant 8 : i64
-/// %2 = call @__nvqpp_cudaq_state_createFromData_fp32(%0, %1) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-/// %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
+/// %c8_i64 = arith.constant 8 : i64
+/// %2 = cc.create_state %0, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+/// %3 = cc.get_number_of_qubits %2 : (!cc.ptr<!cc.state>) -> i64
 /// ...
 /// ───────────────────────────────────────────
-/// %1 = arith.constant 8 : i64
-/// %2 = call @__nvqpp_cudaq_state_createFromData_fp32(%0, %1) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-/// %5 = arith.constant 3 : i64
+/// %c8_i64 = arith.constant 8 : i64
+/// %2 = cc.create_state %0, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+/// %3 = arith.constant 3 : i64
 /// ```
 // clang-format on
-class NumberOfQubitsPattern : public OpRewritePattern<func::CallOp> {
+class NumberOfQubitsPattern
+    : public OpRewritePattern<cudaq::cc::GetNumberOfQubitsOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(func::CallOp callOp,
+  LogicalResult matchAndRewrite(cudaq::cc::GetNumberOfQubitsOp op,
                                 PatternRewriter &rewriter) const override {
-    if (isNumberOfQubitsCall(callOp)) {
-      auto createStateOp = callOp.getOperand(0).getDefiningOp();
-      if (isCreateStateCall(createStateOp)) {
-        auto size = getStateSize(createStateOp);
-        rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(
-            callOp, std::countr_zero(size), rewriter.getI64Type());
-        return success();
-      }
+    auto createStateOp =
+        op.getOperand().getDefiningOp<cudaq::cc::CreateStateOp>();
+    // A zero size means getStateSize failed (and reported it): do not fold.
+    if (std::size_t size = createStateOp ? getStateSize(createStateOp) : 0) {
+      rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(
+          op, std::countr_zero(size), rewriter.getI64Type());
+      return success();
     }
     return failure();
   }
 };
 
 // clang-format off
-/// Replace calls to `__nvqpp_cudaq_state_numberOfQubits` by a constant.
+/// Remove `cc.create_state` instructions and pass their data directly to
+/// the `quake.init_state` instruction instead.
 /// ```
 /// %2 = cc.cast %1 : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-/// %3 = call @__nvqpp_cudaq_state_createFromData_fp32(%2, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+/// %3 = cc.create_state %2, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
 /// %4 = quake.alloca !quake.veq<?>[%0 : i64]
 /// %5 = quake.init_state %4, %3 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
 /// ...
-/// %3 = call @__nvqpp_cudaq_state_createFromData_fp32(%2, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
 /// %4 = quake.alloca !quake.veq<?>[%0 : i64]
 /// %5 = quake.init_state %4, %1 : (!quake.veq<?>, !cc.ptr<!cc.array<complex<f32> x 8>>) -> !quake.veq<?>
 /// ```
 // clang-format on
+
 class StateToDataPattern : public OpRewritePattern<quake::InitializeStateOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
                                 PatternRewriter &rewriter) const override {
-    auto stateOp = initState.getOperand(1).getDefiningOp();
+    auto state = initState.getOperand(1);
     auto targets = initState.getTargets();
 
-    if (isCreateStateCall(stateOp)) {
-      auto dataOp = stateOp->getOperand(0);
-      if (auto cast = dyn_cast<cudaq::cc::CastOp>(dataOp.getDefiningOp()))
+    if (auto createStateOp = state.getDefiningOp<cudaq::cc::CreateStateOp>()) {
+      auto dataOp = createStateOp->getOperand(0);
+      if (auto cast = dataOp.getDefiningOp<cudaq::cc::CastOp>())
         dataOp = cast.getOperand();
       rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
           initState, targets.getType(), targets, dataOp);
@@ -163,10 +138,8 @@ class DeleteStatesPass
       llvm::SmallVector<Operation *> usedStates;
 
       func.walk([&](Operation *op) {
-        if (isCreateStateCall(op)) {
-          if (op->getUses().empty())
-            op->erase();
-          else
+        if (isa<cudaq::cc::CreateStateOp>(op)) {
+          if (!op->getUses().empty())
             usedStates.push_back(op);
         }
       });
@@ -178,15 +151,16 @@ class DeleteStatesPass
         func.walk([&](Operation *op) {
           if (isa<func::ReturnOp>(op)) {
             auto loc = op->getLoc();
-            auto deleteState = cudaq::deleteCudaqState;
-            auto result = irBuilder.loadIntrinsic(module, deleteState);
+            auto result =
+                irBuilder.loadIntrinsic(module, cudaq::deleteCudaqState);
             assert(succeeded(result) && "loading intrinsic should never fail");
 
             builder.setInsertionPoint(op);
             for (auto createStateOp : usedStates) {
-              auto results = cast<func::CallOp>(createStateOp).getResults();
-              builder.create<func::CallOp>(loc, std::nullopt, deleteState,
-                                           results);
+              auto result = cast<cudaq::cc::CreateStateOp>(createStateOp);
+              builder.create<func::CallOp>(loc, std::nullopt,
+                                           cudaq::deleteCudaqState,
+                                           mlir::ValueRange{result});
             }
           }
         });
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index cae278143f..13d694d7bc 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -2246,11 +2246,9 @@ def bodyBuilder(iterVal):
                         # handle `cudaq.qvector(state)`
                         statePtr = self.ifNotPointerThenStore(valueOrPtr)
 
-                        symName = '__nvqpp_cudaq_state_numberOfQubits'
-                        load_intrinsic(self.module, symName)
                         i64Ty = self.getIntegerType()
-                        numQubits = func.CallOp([i64Ty], symName,
-                                                [statePtr]).result
+                        numQubits = cc.GetNumberOfQubitsOp(i64Ty,
+                                                           statePtr).result
 
                         veqTy = quake.VeqType.get(self.ctx)
                         qubits = quake.AllocaOp(veqTy, size=numQubits).result
diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py
index e8d6345ffb..9f528acfee 100644
--- a/python/cudaq/kernel/kernel_builder.py
+++ b/python/cudaq/kernel/kernel_builder.py
@@ -777,10 +777,8 @@ def qalloc(self, initializer=None):
             if isinstance(initializer, cudaq_runtime.State):
                 statePtr = self.capturedDataStorage.storeCudaqState(initializer)
 
-                symName = '__nvqpp_cudaq_state_numberOfQubits'
-                load_intrinsic(self.module, symName)
                 i64Ty = self.getIntegerType()
-                numQubits = func.CallOp([i64Ty], symName, [statePtr]).result
+                numQubits = cc.GetNumberOfQubitsOp(i64Ty, statePtr).result
 
                 veqTy = quake.VeqType.get(self.ctx)
                 qubits = quake.AllocaOp(veqTy, size=numQubits).result
@@ -816,11 +814,9 @@ def qalloc(self, initializer=None):
                     if cc.StateType.isinstance(valueTy):
                         statePtr = initializer.mlirValue
 
-                        symName = '__nvqpp_cudaq_state_numberOfQubits'
-                        load_intrinsic(self.module, symName)
                         i64Ty = self.getIntegerType()
-                        numQubits = func.CallOp([i64Ty], symName,
-                                                [statePtr]).result
+                        numQubits = cc.GetNumberOfQubitsOp(i64Ty,
+                                                           statePtr).result
 
                         veqTy = quake.VeqType.get(self.ctx)
                         qubits = quake.AllocaOp(veqTy, size=numQubits).result
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 0de2589752..09ddb9c74b 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -130,33 +130,18 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
       std::string name =
           kernelName.str() + ".rodata_synth_" + std::to_string(counter++);
       irBuilder.genVectorOfConstants(loc, substMod, name, vec);
-      auto conGlobal = builder.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
-      return builder.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
+      return builder.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
     };
 
-    auto conArr = is64Bit ? genConArray.template operator()<double>()
+    auto buffer = is64Bit ? genConArray.template operator()<double>()
                           : genConArray.template operator()<float>();
 
-    auto createState = is64Bit ? cudaq::createCudaqStateFromDataFP64
-                               : cudaq::createCudaqStateFromDataFP32;
-    auto result = irBuilder.loadIntrinsic(substMod, createState);
-    assert(succeeded(result) && "loading intrinsic should never fail");
-
     auto arrSize = builder.create<arith::ConstantIntOp>(loc, size, 64);
     auto stateTy = cudaq::cc::StateType::get(ctx);
     auto statePtrTy = cudaq::cc::PointerType::get(stateTy);
-    auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
-    auto buffer = builder.create<cudaq::cc::AllocaOp>(loc, arrTy);
-    builder.create<cudaq::cc::StoreOp>(loc, conArr, buffer);
-
-    auto cast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, buffer);
-    auto statePtr = builder
-                        .create<func::CallOp>(loc, statePtrTy, createState,
-                                              ValueRange{cast, arrSize})
-                        .getResult(0);
 
-    // TODO: Delete the new state before function exit.
-    return builder.create<cudaq::cc::CastOp>(loc, statePtrTy, statePtr);
+    return builder.create<cudaq::cc::CreateStateOp>(loc, statePtrTy, buffer,
+                                                    arrSize);
   }
   // The program is executed on quantum hardware, state data is not
   // available and needs to be regenerated.
diff --git a/runtime/cudaq/builder/kernel_builder.cpp b/runtime/cudaq/builder/kernel_builder.cpp
index 6961cc547f..ebf10a6978 100644
--- a/runtime/cudaq/builder/kernel_builder.cpp
+++ b/runtime/cudaq/builder/kernel_builder.cpp
@@ -514,16 +514,11 @@ QuakeValue qalloc(ImplicitLocOpBuilder &builder, QuakeValue &sizeOrVec) {
     auto eleTy = statePtrTy.getElementType();
     if (auto stateTy = dyn_cast<cc::StateType>(eleTy)) {
       // get the number of qubits
-      IRBuilder irBuilder(context);
-      auto mod = builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
-      auto result = irBuilder.loadIntrinsic(mod, getNumQubitsFromCudaqState);
-      assert(succeeded(result) && "loading intrinsic should never fail");
-      auto numQubits = builder.create<func::CallOp>(
-          builder.getI64Type(), getNumQubitsFromCudaqState, ValueRange{value});
+      auto numQubits = builder.create<cudaq::cc::GetNumberOfQubitsOp>(
+          builder.getI64Type(), value);
       // allocate the number of qubits we need
       auto veqTy = quake::VeqType::getUnsized(context);
-      Value qubits =
-          builder.create<quake::AllocaOp>(veqTy, numQubits.getResult(0));
+      Value qubits = builder.create<quake::AllocaOp>(veqTy, numQubits);
       // Add the initialize state op
       qubits = builder.create<quake::InitializeStateOp>(qubits.getType(),
                                                         qubits, value);
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 9fe3d92f8f..1326ac4d39 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -380,16 +380,10 @@ void test_state(mlir::MLIRContext *ctx) {
 
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_2:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_4]], %[[VAL_2]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
+// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
 // CHECK:        }
 // CHECK-DAG:    cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
-// CHECK-DAG:    func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
   // clang-format on
 }
 
@@ -490,16 +484,10 @@ void test_combinations(mlir::MLIRContext *ctx) {
 // CHECK:         }
 // CHECK-LABEL:   cc.arg_subst[1] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_2:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_4]], %[[VAL_2]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
+// CHECK:           %[[VAL_5:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK-DAG:     cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
-// CHECK-DAG:     func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
 // CHECK-LABEL:   cc.arg_subst[2] {
 // CHECK:           %[[VAL_0:.*]] = cc.alloca !cc.array<!cc.charspan x 2>
 // CHECK:           %[[VAL_1:.*]] = cc.address_of @cstr.585800 : !cc.ptr<!llvm.array<3 x i8>>
diff --git a/test/AST-Quake/qalloc_state.cpp b/test/AST-Quake/qalloc_state.cpp
index 191c9c3a30..822f1e1f56 100644
--- a/test/AST-Quake/qalloc_state.cpp
+++ b/test/AST-Quake/qalloc_state.cpp
@@ -20,7 +20,7 @@ struct Eins {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Eins(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_0]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
@@ -34,7 +34,7 @@ struct Zwei {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Zwei(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_0]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
@@ -48,7 +48,7 @@ struct Drei {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Drei(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_0]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
@@ -62,8 +62,7 @@ struct Vier {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Vier(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_0]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
-// CHECK: func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
diff --git a/test/Quake/delete_states.qke b/test/Quake/delete_states.qke
index caa7cca621..bc9c3e1d47 100644
--- a/test/Quake/delete_states.qke
+++ b/test/Quake/delete_states.qke
@@ -12,33 +12,24 @@ module {
   func.func @__nvqpp__mlirgen__function_test_state_param._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %c8_i64 = arith.constant 8 : i64
     %0 = cc.address_of @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %1 = cc.load %0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %2 = cc.alloca !cc.array<complex<f32> x 8>
-    cc.store %1, %2 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %3 = cc.cast %2 : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-    %4 = call @__nvqpp_cudaq_state_createFromData_fp32(%3, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-    %5 = call @__nvqpp_cudaq_state_numberOfQubits(%4) : (!cc.ptr<!cc.state>) -> i64
-    %6 = quake.alloca !quake.veq<?>[%5 : i64]
-    %7 = quake.init_state %6, %4 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    %1 = cc.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+    %2 = cc.get_number_of_qubits %1 : (!cc.ptr<!cc.state>) -> i64
+    %3 = quake.alloca !quake.veq<?>[%2 : i64]
+    %4 = quake.init_state %3, %1 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
   }
-  func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
-  cc.global constant @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00
-,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
-  func.func private @__nvqpp_cudaq_state_createFromData_fp32(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+  cc.global constant @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_state_param._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_2:.*]] = cc.alloca !cc.array<complex<f32> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_2]] : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_3:.*]] = quake.alloca !quake.veq<3>
-// CHECK:           %[[VAL_4:.*]] = quake.init_state %[[VAL_3]], %[[VAL_2]] : (!quake.veq<3>, !cc.ptr<!cc.array<complex<f32> x 8>>) -> !quake.veq<3>
+// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<3>
+// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<3>, !cc.ptr<!cc.array<complex<f32> x 8>>) -> !quake.veq<3>
+// CHECK:           return
 // CHECK:         }
-// CHECK-DAG:     cc.global constant @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
+// CHECK-DAG:    cc.global constant @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 
   func.func @__nvqpp__mlirgen__sub_kernel(%arg : !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
-    %0 = call @__nvqpp_cudaq_state_numberOfQubits(%arg) : (!cc.ptr<!cc.state>) -> i64
+    %0 = cc.get_number_of_qubits %arg : (!cc.ptr<!cc.state>) -> i64
     %1 = quake.alloca !quake.veq<?>[%0 : i64]
     %2 = quake.init_state %1, %arg : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
@@ -47,38 +38,28 @@ module {
   func.func @__nvqpp__mlirgen__function_test_state_param1._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %c8_i64 = arith.constant 8 : i64
     %0 = cc.address_of @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %1 = cc.load %0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %2 = cc.alloca !cc.array<complex<f32> x 8>
-    cc.store %1, %2 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %3 = cc.cast %2 : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-    %4 = call @__nvqpp_cudaq_state_createFromData_fp32(%3, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-    call @__nvqpp__mlirgen__sub_kernel(%4) : (!cc.ptr<!cc.state>) -> ()
+    %1 = cc.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+    call @__nvqpp__mlirgen__sub_kernel(%1) : (!cc.ptr<!cc.state>) -> ()
     return
   }
 
   cc.global constant @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00
 ,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 
-// CHECK:         func.func @__nvqpp__mlirgen__sub_kernel(%[[VAL_ARG:.*]]: !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_ARG]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:         func.func @__nvqpp__mlirgen__sub_kernel(%arg0: !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = cc.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<?>[%[[VAL_0]] : i64]
-// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_ARG]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 // CHECK:           return
-// CHECK:          }
-
-// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_state_param1._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:         }
+// CHECK:         func.func @__nvqpp__mlirgen__function_test_state_param1._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 8 : i64
 // CHECK:           %[[VAL_1:.*]] = cc.address_of @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_2:.*]] = cc.load %[[VAL_1]] : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f32> x 8>
-// CHECK:           cc.store %[[VAL_2]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = call @__nvqpp_cudaq_state_createFromData_fp32(%[[VAL_4]], %[[VAL_0]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           call @__nvqpp__mlirgen__sub_kernel(%[[VAL_5]]) : (!cc.ptr<!cc.state>) -> ()
-// CHECK:           call @__nvqpp_cudaq_state_delete(%[[VAL_5]]) : (!cc.ptr<!cc.state>) -> ()
+// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_1]], %[[VAL_0]] : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           call @__nvqpp__mlirgen__sub_kernel(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> ()
+// CHECK:           call @__nvqpp_cudaq_state_delete(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> ()
 // CHECK:           return
 // CHECK:         }
-// CHECK-DAG:     constant @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
+// CHECK-DAG:     cc.global constant @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 // CHECK-DAG:     func.func private @__nvqpp_cudaq_state_delete(!cc.ptr<!cc.state>)
 }
-

From 102f8196fef4393441f42c13a40961c05ba34ea7 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 5 Nov 2024 09:51:04 -0800
Subject: [PATCH 2/6] Fix test_argument_conversion

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/test/test_argument_conversion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 1326ac4d39..7c8e9f4205 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -381,7 +381,7 @@ void test_state(mlir::MLIRContext *ctx) {
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
 // CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
 // CHECK:        }
 // CHECK-DAG:    cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
   // clang-format on

From 5ea1d973daf78890ee7f4ad2b780f9adca868d42 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 5 Nov 2024 10:00:57 -0800
Subject: [PATCH 3/6] Add printing in failing tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 python/tests/kernel/test_kernel_qvector_state_init.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/tests/kernel/test_kernel_qvector_state_init.py b/python/tests/kernel/test_kernel_qvector_state_init.py
index 18fa3914b3..c832cd6483 100644
--- a/python/tests/kernel/test_kernel_qvector_state_init.py
+++ b/python/tests/kernel/test_kernel_qvector_state_init.py
@@ -32,11 +32,18 @@ def test_kernel_synthesis_complex():
     def kernel(vec: cudaq.State):
         q = cudaq.qvector(vec)
 
+    counts = cudaq.sample(kernel, state)
+    print(f"Non-synthesized: ${counts}")
+    assert '00' in counts
+    assert '10' in counts
+    assert len(counts) == 2
+
     synthesized = cudaq.synthesize(kernel, state)
     counts = cudaq.sample(synthesized)
-    print(counts)
-    assert '10' in counts
+    print(f"Synthesized: ${counts}")
     assert '00' in counts
+    assert '10' in counts
+    assert len(counts) == 2
 
 
 # float

From 074c60f778f9dc49995199903d99fe3f83eff41b Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 5 Nov 2024 10:02:38 -0800
Subject: [PATCH 4/6] Add printing in failing tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 python/tests/kernel/test_kernel_qvector_state_init.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/tests/kernel/test_kernel_qvector_state_init.py b/python/tests/kernel/test_kernel_qvector_state_init.py
index c832cd6483..64c1ef55d4 100644
--- a/python/tests/kernel/test_kernel_qvector_state_init.py
+++ b/python/tests/kernel/test_kernel_qvector_state_init.py
@@ -34,6 +34,7 @@ def kernel(vec: cudaq.State):
 
     counts = cudaq.sample(kernel, state)
     print(f"Non-synthesized: ${counts}")
+    print(kernel)
     assert '00' in counts
     assert '10' in counts
     assert len(counts) == 2
@@ -41,6 +42,7 @@ def kernel(vec: cudaq.State):
     synthesized = cudaq.synthesize(kernel, state)
     counts = cudaq.sample(synthesized)
     print(f"Synthesized: ${counts}")
+    print(synthesized)
     assert '00' in counts
     assert '10' in counts
     assert len(counts) == 2

From 310f6ca48e0f458b23accbb84125ecca0591b902 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 12 Nov 2024 10:06:42 -0800
Subject: [PATCH 5/6] Fix failing tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 lib/Optimizer/CodeGen/QuakeToCodegen.cpp      |  2 +-
 python/cudaq/kernel/ast_bridge.py             |  3 +++
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  1 +
 .../kernel/test_kernel_qvector_state_init.py  | 24 ++++++++-----------
 runtime/common/BaseRemoteRESTQPU.h            |  1 +
 5 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
index 6e913a2bec..6774847bf8 100644
--- a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
+++ b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
@@ -85,7 +85,7 @@ class CreateStateOpPattern : public OpRewritePattern<cudaq::cc::CreateStateOp> {
     auto is64Bit = isa<Float64Type>(eleTy);
 
     if (auto cTy = dyn_cast<ComplexType>(eleTy))
-      is64Bit = isa<Float64Type>(eleTy);
+      is64Bit = isa<Float64Type>(cTy.getElementType());
 
     auto createStateFunc = is64Bit ? cudaq::createCudaqStateFromDataFP64
                                    : cudaq::createCudaqStateFromDataFP32;
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 13d694d7bc..ffd930bf72 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -3829,6 +3829,9 @@ def visit_Name(self, node):
                 if cc.StdvecType.isinstance(eleTy):
                     self.pushValue(value)
                     return
+                if cc.StateType.isinstance(eleTy):
+                    self.pushValue(value)
+                    return
                 loaded = cc.LoadOp(value).result
                 self.pushValue(loaded)
             elif cc.CallableType.isinstance(
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 90ba42b617..b995f71f1a 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -535,6 +535,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   pm.addNestedPass<func::FuncOp>(
       cudaq::opt::createArgumentSynthesisPass(kernels, substs));
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addPass(opt::createDeleteStates());
 
   // Run state preparation for quantum devices (or their emulation) only.
   // Simulators have direct implementation of state initialization
diff --git a/python/tests/kernel/test_kernel_qvector_state_init.py b/python/tests/kernel/test_kernel_qvector_state_init.py
index 64c1ef55d4..84a3a603f1 100644
--- a/python/tests/kernel/test_kernel_qvector_state_init.py
+++ b/python/tests/kernel/test_kernel_qvector_state_init.py
@@ -33,16 +33,12 @@ def kernel(vec: cudaq.State):
         q = cudaq.qvector(vec)
 
     counts = cudaq.sample(kernel, state)
-    print(f"Non-synthesized: ${counts}")
-    print(kernel)
     assert '00' in counts
     assert '10' in counts
     assert len(counts) == 2
 
     synthesized = cudaq.synthesize(kernel, state)
     counts = cudaq.sample(synthesized)
-    print(f"Synthesized: ${counts}")
-    print(synthesized)
     assert '00' in counts
     assert '10' in counts
     assert len(counts) == 2
@@ -55,7 +51,7 @@ def kernel(vec: cudaq.State):
 def test_kernel_float_params_f64():
 
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     f = np.array([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)], dtype=float)
 
@@ -85,7 +81,7 @@ def test_kernel_float_params_f32():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex_params_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=complex)
@@ -104,7 +100,7 @@ def kernel(vec: cudaq.State):
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex128_params_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=np.complex128)
@@ -123,7 +119,7 @@ def kernel(vec: cudaq.State):
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex64_params_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=np.complex64)
@@ -181,7 +177,7 @@ def test_kernel_complex_params_f32():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex_capture_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=complex)
@@ -200,7 +196,7 @@ def kernel():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex128_capture_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=np.complex128)
@@ -219,7 +215,7 @@ def kernel():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex128_capture_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=np.complex64)
@@ -280,7 +276,7 @@ def test_kernel_complex_capture_f32():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_simulation_dtype_complex_params_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=cudaq.complex())
@@ -318,7 +314,7 @@ def kernel(vec: cudaq.State):
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_simulation_dtype_capture_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=cudaq.complex())
@@ -359,7 +355,7 @@ def kernel():
 @skipIfNvidiaFP64NotInstalled
 def test_init_from_other_kernel_state_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     @cudaq.kernel
     def bell():
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 0834bc7e3e..84eb527ebb 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -444,6 +444,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         mlir::SmallVector<mlir::StringRef> substs = {substBuff};
         pm.addNestedPass<mlir::func::FuncOp>(
             opt::createArgumentSynthesisPass(kernels, substs));
+        pm.addPass(opt::createDeleteStates());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));

From a5a553eb47bb89bac829b28a43c92a2213996980 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 2 Dec 2024 10:33:28 -0800
Subject: [PATCH 6/6] Address CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Dialect/CC/CCOps.td   | 62 --------------
 .../cudaq/Optimizer/Dialect/Quake/QuakeOps.td | 85 +++++++++++++++++++
 lib/Frontend/nvqpp/ConvertExpr.cpp            |  2 +-
 lib/Optimizer/CodeGen/QuakeToCodegen.cpp      | 31 +++++--
 lib/Optimizer/Transforms/DeleteStates.cpp     | 31 +++----
 python/cudaq/kernel/ast_bridge.py             |  4 +-
 python/cudaq/kernel/kernel_builder.py         |  6 +-
 runtime/common/ArgumentConversion.cpp         |  5 +-
 runtime/cudaq/builder/kernel_builder.cpp      |  2 +-
 runtime/test/test_argument_conversion.cpp     |  4 +-
 test/AST-Quake/qalloc_state.cpp               |  8 +-
 test/Quake/delete_states.qke                  | 47 +++++-----
 12 files changed, 164 insertions(+), 123 deletions(-)

diff --git a/include/cudaq/Optimizer/Dialect/CC/CCOps.td b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
index 1f1214f238..7b76067485 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCOps.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
@@ -899,68 +899,6 @@ def cc_AddressOfOp : CCOp<"address_of", [Pure,
   }];
 }
 
-def cc_CreateStateOp : CCOp<"create_state", [Pure] > {
-  let summary = "Create state from data";
-  let description = [{
-    This operation takes a pointer to state data and creates a quantum state.
-    The operation can be optimized away in DeleteStates pass, or replaced
-    by an intrinsic runtime call on simulators.
-
-    ```mlir
-      %0 = cc.create_state %data: !cc.ptr<!cc.state>
-    ```
-  }];
-
-  let arguments = (ins
-    AnyPointerType:$data,
-    AnySignlessInteger:$length
-  );
-  let results = (outs AnyPointerType:$result);
-  let assemblyFormat = [{
-      $data `,` $length `:` functional-type(operands, results) attr-dict
-  }];
-}
-
-def cc_GetNumberOfQubitsOp : CCOp<"get_number_of_qubits", [Pure] > {
-  let summary = "Get number of qubits from a quantum state";
-  let description = [{
-    This operation takes a state pointer argument and returns a number of
-    qubits in the state. The operation can be optimized away in some passes
-    line ReplaceStateByKernel or DeleteStates, or replaced by an intrinsic
-    runtime call on simulators.
-
-    ```mlir
-      %0 = cc.get_number_of_qubits %state : i64
-    ```
-  }];
-
-  let arguments = (ins cc_PointerType:$state);
-  let results = (outs AnySignlessInteger:$result);
-  let assemblyFormat = [{
-      $state `:` functional-type(operands, results) attr-dict
-  }];
-}
-
-def cc_GetStateOp : CCOp<"get_state", [Pure] > {
-  let summary = "Get state from kernel with the provided name.";
-  let description = [{
-    This operation is created by argument synthesis of state pointer arguments
-    for quantum devices. It takes a kernel name as ASCIIZ string literal value
-    and returns the kernel's quantum state. The operation is replaced by a call
-    to the kernel with the provided name in ReplaceStateByKernel pass.
-
-    ```mlir
-      %0 = cc.get_state "callee" : !cc.ptr<!cc.state>
-    ```
-  }];
-
-  let arguments = (ins StrAttr:$calleeName);
-  let results = (outs cc_PointerType:$result);
-  let assemblyFormat = [{
-     $calleeName `:` qualified(type(results)) attr-dict
-  }];
-}
-
 def cc_GlobalOp : CCOp<"global", [IsolatedFromAbove, Symbol]> {
   let summary = "Create a global constant or variable";
   let description = [{
diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index 7fbf2f0257..87dd1f53a6 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -1397,4 +1397,89 @@ def CustomUnitarySymbolOp :
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// Quantum states
+//===----------------------------------------------------------------------===//
+
+def quake_CreateStateOp : QuakeOp<"create_state", [Pure] > {
+  let summary = "Create state from data";
+  let description = [{
+    This operation takes a pointer to state data and creates a quantum state.
+    The operation can be optimized away in DeleteStates pass, or replaced
+    by an intrinsic runtime call on simulators.
+
+    ```mlir
+      %0 = quake.create_state %data, %len : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+    ```
+  }];
+
+  let arguments = (ins
+    cc_PointerType:$data,
+    AnySignlessInteger:$length
+  );
+  let results = (outs cc_PointerType:$result);
+  let assemblyFormat = [{
+      $data `,` $length `:` functional-type(operands, results) attr-dict
+  }];
+}
+
+def QuakeOp_DeleteStateOp : QuakeOp<"delete_state", [] > {
+  let summary = "Delete quantum state";
+  let description =  [{
+    This operation takes a pointer to the state and deletes the state object.
+    The operation can be created in the DeleteStates pass, and replaced later
+    by an intrinsic runtime call on simulators.
+
+    ```mlir
+      quake.delete_state %state : (!cc.ptr<!cc.state>) -> ()
+    ```
+  }];
+
+  let arguments = (ins cc_PointerType:$state);
+  let results = (outs);
+  let assemblyFormat = [{
+      $state `:` functional-type(operands, results) attr-dict
+  }];
+}
+
+def quake_GetNumberOfQubitsOp : QuakeOp<"get_number_of_qubits", [Pure] > {
+  let summary = "Get number of qubits from a quantum state";
+  let description = [{
+    This operation takes a state pointer argument and returns a number of
+    qubits in the state. The operation can be optimized away in some passes
+    like ReplaceStateByKernel or DeleteStates, or replaced by an intrinsic
+    runtime call on simulators.
+
+    ```mlir
+      %0 = quake.get_number_of_qubits %state : (!cc.ptr<!cc.state>) -> i64
+    ```
+  }];
+
+  let arguments = (ins cc_PointerType:$state);
+  let results = (outs AnySignlessInteger:$result);
+  let assemblyFormat = [{
+      $state `:` functional-type(operands, results) attr-dict
+  }];
+}
+
+def QuakeOp_GetStateOp : QuakeOp<"get_state", [Pure] > {
+  let summary = "Get state from kernel with the provided name.";
+  let description = [{
+    This operation is created by argument synthesis of state pointer arguments
+    for quantum devices. It takes a kernel name as ASCIIZ string literal value
+    and returns the kernel's quantum state. The operation is replaced by a call
+    to the kernel with the provided name in ReplaceStateByKernel pass.
+
+    ```mlir
+      %0 = quake.get_state "callee" : !cc.ptr<!cc.state>
+    ```
+  }];
+
+  let arguments = (ins StrAttr:$calleeName);
+  let results = (outs cc_PointerType:$result);
+  let assemblyFormat = [{
+     $calleeName `:` qualified(type(results)) attr-dict
+  }];
+}
+
 #endif // CUDAQ_OPTIMIZER_DIALECT_QUAKE_OPS
diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp
index b2dc1dacff..3e9b8dea95 100644
--- a/lib/Frontend/nvqpp/ConvertExpr.cpp
+++ b/lib/Frontend/nvqpp/ConvertExpr.cpp
@@ -2700,7 +2700,7 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
           Value state = initials;
           auto i64Ty = builder.getI64Type();
           auto numQubits =
-              builder.create<cudaq::cc::GetNumberOfQubitsOp>(loc, i64Ty, state);
+              builder.create<quake::GetNumberOfQubitsOp>(loc, i64Ty, state);
           auto veqTy = quake::VeqType::getUnsized(ctx);
           Value alloc = builder.create<quake::AllocaOp>(loc, veqTy, numQubits);
           return pushValue(builder.create<quake::InitializeStateOp>(
diff --git a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
index 6774847bf8..e1483a133e 100644
--- a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
+++ b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
@@ -66,11 +66,11 @@ class ExpandComplexCast : public OpRewritePattern<cudaq::cc::CastOp> {
   }
 };
 
-class CreateStateOpPattern : public OpRewritePattern<cudaq::cc::CreateStateOp> {
+class CreateStateOpPattern : public OpRewritePattern<quake::CreateStateOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(cudaq::cc::CreateStateOp createStateOp,
+  LogicalResult matchAndRewrite(quake::CreateStateOp createStateOp,
                                 PatternRewriter &rewriter) const override {
     auto module = createStateOp->getParentOfType<ModuleOp>();
     auto loc = createStateOp.getLoc();
@@ -104,12 +104,33 @@ class CreateStateOpPattern : public OpRewritePattern<cudaq::cc::CreateStateOp> {
   }
 };
 
+class DeleteStateOpPattern : public OpRewritePattern<quake::DeleteStateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::DeleteStateOp deleteStateOp,
+                                PatternRewriter &rewriter) const override {
+    auto module = deleteStateOp->getParentOfType<ModuleOp>();
+    auto ctx = deleteStateOp.getContext();
+    auto state = deleteStateOp.getOperand();
+
+    cudaq::IRBuilder irBuilder(ctx);
+    auto result = irBuilder.loadIntrinsic(module, cudaq::deleteCudaqState);
+    assert(succeeded(result) && "loading intrinsic should never fail");
+
+    rewriter.replaceOpWithNewOp<func::CallOp>(deleteStateOp, std::nullopt,
+                                              cudaq::deleteCudaqState,
+                                              mlir::ValueRange{state});
+    return success();
+  }
+};
+
 class GetNumberOfQubitsOpPattern
-    : public OpRewritePattern<cudaq::cc::GetNumberOfQubitsOp> {
+    : public OpRewritePattern<quake::GetNumberOfQubitsOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(cudaq::cc::GetNumberOfQubitsOp getNumQubitsOp,
+  LogicalResult matchAndRewrite(quake::GetNumberOfQubitsOp getNumQubitsOp,
                                 PatternRewriter &rewriter) const override {
     auto module = getNumQubitsOp->getParentOfType<ModuleOp>();
     auto ctx = getNumQubitsOp.getContext();
@@ -133,5 +154,5 @@ void cudaq::codegen::populateQuakeToCodegenPatterns(
     mlir::RewritePatternSet &patterns) {
   auto *ctx = patterns.getContext();
   patterns.insert<CodeGenRAIIPattern, ExpandComplexCast, CreateStateOpPattern,
-                  GetNumberOfQubitsOpPattern>(ctx);
+                  DeleteStateOpPattern, GetNumberOfQubitsOpPattern>(ctx);
 }
diff --git a/lib/Optimizer/Transforms/DeleteStates.cpp b/lib/Optimizer/Transforms/DeleteStates.cpp
index 74b3a432c2..a6fde45d72 100644
--- a/lib/Optimizer/Transforms/DeleteStates.cpp
+++ b/lib/Optimizer/Transforms/DeleteStates.cpp
@@ -31,7 +31,7 @@ using namespace mlir;
 namespace {
 /// For a `cc:CreateStateOp`, get the number of qubits allocated.
 static std::size_t getStateSize(Operation *op) {
-  if (auto createStateOp = dyn_cast<cudaq::cc::CreateStateOp>(op)) {
+  if (auto createStateOp = dyn_cast<quake::CreateStateOp>(op)) {
     auto sizeOperand = createStateOp.getOperand(1);
     auto defOp = sizeOperand.getDefiningOp();
     while (defOp && !dyn_cast<arith::ConstantIntOp>(defOp))
@@ -44,28 +44,27 @@ static std::size_t getStateSize(Operation *op) {
 }
 
 // clang-format off
-/// Replace `cc.get_number_of_qubits` by a constant.
+/// Replace `quake.get_number_of_qubits` by a constant.
 /// ```
 /// %c8_i64 = arith.constant 8 : i64
-/// %2 = cc.create_state %3, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-/// %3 = cc.get_number_of_qubits %2 : i64
+/// %2 = quake.create_state %1, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+/// %3 = quake.get_number_of_qubits %2 : (!cc.ptr<!cc.state>) -> i64
 /// ...
 /// ───────────────────────────────────────────
 /// %c8_i64 = arith.constant 8 : i64
-/// %2 = cc.create_state %3, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+/// %2 = quake.create_state %1, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
 /// %3 = arith.constant 3 : i64
 /// ```
 // clang-format on
 class NumberOfQubitsPattern
-    : public OpRewritePattern<cudaq::cc::GetNumberOfQubitsOp> {
+    : public OpRewritePattern<quake::GetNumberOfQubitsOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(cudaq::cc::GetNumberOfQubitsOp op,
+  LogicalResult matchAndRewrite(quake::GetNumberOfQubitsOp op,
                                 PatternRewriter &rewriter) const override {
     auto stateOp = op.getOperand();
-    if (auto createStateOp =
-            stateOp.getDefiningOp<cudaq::cc::CreateStateOp>()) {
+    if (auto createStateOp = stateOp.getDefiningOp<quake::CreateStateOp>()) {
       auto size = getStateSize(createStateOp);
       rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(
           op, std::countr_zero(size), rewriter.getI64Type());
@@ -76,11 +75,11 @@ class NumberOfQubitsPattern
 };
 
 // clang-format off
-/// Remove `cc.create_state` instructions and pass their data directly to
+/// Remove `quake.create_state` instructions and pass their data directly to
 /// the `quake.state_init` instruction instead.
 /// ```
 /// %2 = cc.cast %1 : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-/// %3 = cc.create_state %3, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+/// %3 = quake.create_state %2, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
 /// %4 = quake.alloca !quake.veq<?>[%0 : i64]
 /// %5 = quake.init_state %4, %3 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
@@ -99,7 +98,7 @@ class StateToDataPattern : public OpRewritePattern<quake::InitializeStateOp> {
     auto state = initState.getOperand(1);
     auto targets = initState.getTargets();
 
-    if (auto createStateOp = state.getDefiningOp<cudaq::cc::CreateStateOp>()) {
+    if (auto createStateOp = state.getDefiningOp<quake::CreateStateOp>()) {
       auto dataOp = createStateOp->getOperand(0);
       if (auto cast = dataOp.getDefiningOp<cudaq::cc::CastOp>())
         dataOp = cast.getOperand();
@@ -138,7 +137,7 @@ class DeleteStatesPass
       llvm::SmallVector<Operation *> usedStates;
 
       func.walk([&](Operation *op) {
-        if (isa<cudaq::cc::CreateStateOp>(op)) {
+        if (isa<quake::CreateStateOp>(op)) {
           if (!op->getUses().empty())
             usedStates.push_back(op);
         }
@@ -157,10 +156,8 @@ class DeleteStatesPass
 
             builder.setInsertionPoint(op);
             for (auto createStateOp : usedStates) {
-              auto result = cast<cudaq::cc::CreateStateOp>(createStateOp);
-              builder.create<func::CallOp>(loc, std::nullopt,
-                                           cudaq::deleteCudaqState,
-                                           mlir::ValueRange{result});
+              auto result = cast<quake::CreateStateOp>(createStateOp);
+              builder.create<quake::DeleteStateOp>(loc, result);
             }
           }
         });
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index e25afe020c..2e64da5ec5 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -2247,8 +2247,8 @@ def bodyBuilder(iterVal):
                         statePtr = self.ifNotPointerThenStore(valueOrPtr)
 
                         i64Ty = self.getIntegerType()
-                        numQubits = cc.GetNumberOfQubitsOp(i64Ty,
-                                                           statePtr).result
+                        numQubits = quake.GetNumberOfQubitsOp(i64Ty,
+                                                              statePtr).result
 
                         veqTy = quake.VeqType.get(self.ctx)
                         qubits = quake.AllocaOp(veqTy, size=numQubits).result
diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py
index ffc8aed541..fff1d1a30c 100644
--- a/python/cudaq/kernel/kernel_builder.py
+++ b/python/cudaq/kernel/kernel_builder.py
@@ -778,7 +778,7 @@ def qalloc(self, initializer=None):
                 statePtr = self.capturedDataStorage.storeCudaqState(initializer)
 
                 i64Ty = self.getIntegerType()
-                numQubits = cc.GetNumberOfQubitsOp(i64Ty, statePtr).result
+                numQubits = quake.GetNumberOfQubitsOp(i64Ty, statePtr).result
 
                 veqTy = quake.VeqType.get(self.ctx)
                 qubits = quake.AllocaOp(veqTy, size=numQubits).result
@@ -815,8 +815,8 @@ def qalloc(self, initializer=None):
                         statePtr = initializer.mlirValue
 
                         i64Ty = self.getIntegerType()
-                        numQubits = cc.GetNumberOfQubitsOp(i64Ty,
-                                                           statePtr).result
+                        numQubits = quake.GetNumberOfQubitsOp(i64Ty,
+                                                              statePtr).result
 
                         veqTy = quake.VeqType.get(self.ctx)
                         qubits = quake.AllocaOp(veqTy, size=numQubits).result
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 233d8b7d02..3fe902a2e9 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -10,6 +10,7 @@
 #include "cudaq.h"
 #include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
 #include "cudaq/Todo.h"
 #include "cudaq/qis/pauli_word.h"
 #include "cudaq/utils/registry.h"
@@ -142,8 +143,8 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto stateTy = cudaq::cc::StateType::get(ctx);
     auto statePtrTy = cudaq::cc::PointerType::get(stateTy);
 
-    return builder.create<cudaq::cc::CreateStateOp>(loc, statePtrTy, buffer,
-                                                    arrSize);
+    return builder.create<quake::CreateStateOp>(loc, statePtrTy, buffer,
+                                                arrSize);
   }
   // The program is executed on quantum hardware, state data is not
   // available and needs to be regenerated.
diff --git a/runtime/cudaq/builder/kernel_builder.cpp b/runtime/cudaq/builder/kernel_builder.cpp
index ebf10a6978..99c492d65a 100644
--- a/runtime/cudaq/builder/kernel_builder.cpp
+++ b/runtime/cudaq/builder/kernel_builder.cpp
@@ -514,7 +514,7 @@ QuakeValue qalloc(ImplicitLocOpBuilder &builder, QuakeValue &sizeOrVec) {
     auto eleTy = statePtrTy.getElementType();
     if (auto stateTy = dyn_cast<cc::StateType>(eleTy)) {
       // get the number of qubits
-      auto numQubits = builder.create<cudaq::cc::GetNumberOfQubitsOp>(
+      auto numQubits = builder.create<quake::GetNumberOfQubitsOp>(
           builder.getI64Type(), value);
       // allocate the number of qubits we need
       auto veqTy = quake::VeqType::getUnsized(context);
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 1d6265837f..d40e00ba30 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -378,7 +378,7 @@ void test_state(mlir::MLIRContext *ctx) {
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
 // CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_2:.*]] = quake.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
 // CHECK:        }
 // CHECK-DAG:    cc.global constant private @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
   // clang-format on
@@ -482,7 +482,7 @@ void test_combinations(mlir::MLIRContext *ctx) {
 // CHECK-LABEL:   cc.arg_subst[1] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
 // CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_5:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_5:.*]] = quake.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK-DAG:     cc.global constant private @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
 // CHECK-LABEL:   cc.arg_subst[2] {
diff --git a/test/AST-Quake/qalloc_state.cpp b/test/AST-Quake/qalloc_state.cpp
index 822f1e1f56..2dbfeac8bf 100644
--- a/test/AST-Quake/qalloc_state.cpp
+++ b/test/AST-Quake/qalloc_state.cpp
@@ -20,7 +20,7 @@ struct Eins {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Eins(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
@@ -34,7 +34,7 @@ struct Zwei {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Zwei(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
@@ -48,7 +48,7 @@ struct Drei {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Drei(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
@@ -62,7 +62,7 @@ struct Vier {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Vier(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
diff --git a/test/Quake/delete_states.qke b/test/Quake/delete_states.qke
index 87b6747458..d84a672f13 100644
--- a/test/Quake/delete_states.qke
+++ b/test/Quake/delete_states.qke
@@ -9,57 +9,56 @@
 // RUN: cudaq-opt -delete-states -canonicalize %s | FileCheck %s
 
 module {
-  func.func @__nvqpp__mlirgen__function_test_state_param._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  func.func @test_state_param() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %c8_i64 = arith.constant 8 : i64
-    %0 = cc.address_of @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %1 = cc.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
-    %2 = cc.get_number_of_qubits %1 : (!cc.ptr<!cc.state>) -> i64
+    %0 = cc.address_of @test_state_param.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
+    %1 = quake.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+    %2 = quake.get_number_of_qubits %1 : (!cc.ptr<!cc.state>) -> i64
     %3 = quake.alloca !quake.veq<?>[%2 : i64]
     %4 = quake.init_state %3, %1 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
   }
-  cc.global constant private @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
+  cc.global constant private @test_state_param.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 
-// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_state_param._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = cc.address_of @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
+// CHECK-LABEL:   func.func @test_state_param() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = cc.address_of @test_state_param.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
 // CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<3>
 // CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<3>, !cc.ptr<!cc.array<complex<f32> x 8>>) -> !quake.veq<3>
 // CHECK:           return
 // CHECK:         }
-// CHECK-DAG:    cc.global constant private @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
+// CHECK-DAG:    cc.global constant private @test_state_param.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 
-  func.func @__nvqpp__mlirgen__sub_kernel(%arg : !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
-    %0 = cc.get_number_of_qubits %arg : (!cc.ptr<!cc.state>) -> i64
+  func.func @sub_kernel(%arg : !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
+    %0 = quake.get_number_of_qubits %arg : (!cc.ptr<!cc.state>) -> i64
     %1 = quake.alloca !quake.veq<?>[%0 : i64]
     %2 = quake.init_state %1, %arg : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
   }
 
-  func.func @__nvqpp__mlirgen__function_test_state_param1._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  func.func @test_state_param1() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %c8_i64 = arith.constant 8 : i64
-    %0 = cc.address_of @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %1 = cc.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
-    call @__nvqpp__mlirgen__sub_kernel(%1) : (!cc.ptr<!cc.state>) -> ()
+    %0 = cc.address_of @test_state_param1.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
+    %1 = quake.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+    call @sub_kernel(%1) : (!cc.ptr<!cc.state>) -> ()
     return
   }
 
-  cc.global constant private @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00
+  cc.global constant private @test_state_param1.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00
 ,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 
-// CHECK:         func.func @__nvqpp__mlirgen__sub_kernel(%arg0: !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = cc.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
+// CHECK:         func.func @sub_kernel(%arg0: !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<?>[%[[VAL_0]] : i64]
 // CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 // CHECK:           return
 // CHECK:         }
-// CHECK:         func.func @__nvqpp__mlirgen__function_test_state_param1._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:         func.func @test_state_param1() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_1:.*]] = cc.address_of @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_1]], %[[VAL_0]] : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           call @__nvqpp__mlirgen__sub_kernel(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> ()
-// CHECK:           call @__nvqpp_cudaq_state_delete(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> ()
+// CHECK:           %[[VAL_1:.*]] = cc.address_of @test_state_param1.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
+// CHECK:           %[[VAL_2:.*]] = quake.create_state %[[VAL_1]], %[[VAL_0]] : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           call @sub_kernel(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> ()
+// CHECK:           quake.delete_state %[[VAL_2]] : (!cc.ptr<!cc.state>) -> ()
 // CHECK:           return
 // CHECK:         }
-// CHECK-DAG:     cc.global constant private @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
-// CHECK-DAG:     func.func private @__nvqpp_cudaq_state_delete(!cc.ptr<!cc.state>)
+// CHECK-DAG:     cc.global constant private @test_state_param1.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 }