Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions notebooks/QEfficientGPT2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
"outputs": [],
"source": [
"# Instantiate the original Transformer model\n",
"# Instantiate the tokenizer from the transformers library\n",
"from transformers import AutoTokenizer\n",
"\n",
"from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
"\n",
"# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
Expand Down Expand Up @@ -92,11 +95,7 @@
"# Compile the model for provided compilation arguments\n",
"# Please use the platform SDK to check num_cores for your card.\n",
"\n",
"qeff_model.compile(\n",
" num_cores=14,\n",
" mxfp6=True,\n",
" device_group=[0],\n",
")"
"qeff_model.compile(num_cores=14, mxfp6_matmul=True)"
]
},
{
Expand All @@ -116,8 +115,8 @@
"source": [
"# Post compilation, we can print the latency stats for the KV models. We provide an API to print token and latency stats on Cloud AI 100.\n",
"# We need the compiled prefill and decode QPCs to compute the tokens generated; this is based on a greedy sampling approach.\n",
"\n",
"qeff_model.generate(prompts=[\"My name is\"])"
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)"
]
}
],
Expand Down
11 changes: 5 additions & 6 deletions notebooks/QEfficientMPT.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
"outputs": [],
"source": [
"# Instantiate the original Transformer model\n",
"# Instantiate the tokenizer from the transformers library\n",
"from transformers import AutoTokenizer\n",
"\n",
"from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
"\n",
Expand Down Expand Up @@ -91,11 +93,7 @@
"# Compile the model for provided compilation arguments\n",
"# Please use the platform SDK to check num_cores for your card.\n",
"\n",
"qeff_model.compile(\n",
" num_cores=14,\n",
" mxfp6=True,\n",
" device_group=[0],\n",
")"
"qeff_model.compile(num_cores=14, mxfp6_matmul=True)"
]
},
{
Expand All @@ -116,7 +114,8 @@
"# Post compilation, we can print the latency stats for the KV models. We provide an API to print token and latency stats on Cloud AI 100.\n",
"# We need the compiled prefill and decode QPCs to compute the tokens generated; this is based on a greedy sampling approach.\n",
"\n",
"qeff_model.generate(prompts=[\"My name is\"])"
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)"
]
}
],
Expand Down
Loading