From b7638b038ced93fe061b9e44191b50b14dfa1c39 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Sun, 18 May 2025 17:42:00 +0300 Subject: [PATCH 01/37] wip // max error rate in scheduler --- src/guidellm/scheduler/result.py | 2 ++ src/guidellm/scheduler/scheduler.py | 23 +++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 0f12687f..9d379422 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -46,12 +46,14 @@ class SchedulerRunInfo(StandardBaseModel): end_number: float processes: int strategy: SchedulingStrategy + max_error_rate: float created_requests: int = 0 queued_requests: int = 0 scheduled_requests: int = 0 processing_requests: int = 0 completed_requests: int = 0 + errored_requests: int = 0 class SchedulerRequestInfo(StandardBaseModel): diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 06203827..33204729 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -64,12 +64,14 @@ def __init__( self.worker = worker self.request_loader = request_loader + self.error_rate: Optional[float] = None async def run( self, scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, + max_error_rate: Optional[float] = 0.05, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -98,6 +100,8 @@ async def run( :param max_duration: The maximum duration for the scheduling run. If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. + :param max_error_rate: The maximum error rate after which the scheduler shuts down. + If not provided a default of 5% i.e 0.05 is used. :return: An asynchronous generator that yields SchedulerResult objects. Each SchedulerResult object contains information about the request, the response, and the run information. 
@@ -109,9 +113,12 @@ async def run( if max_number is not None and max_number < 1: raise ValueError(f"Invalid max_number: {max_number}") - if max_duration is not None and max_duration < 0: raise ValueError(f"Invalid max_duration: {max_duration}") + if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): + raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + + shutdown_event = multiprocessing.Event() with ( multiprocessing.Manager() as manager, @@ -124,7 +131,7 @@ async def run( manager, executor, scheduling_strategy ) run_info, requests_iter, times_iter = self._run_setup( - futures, scheduling_strategy, max_number, max_duration + futures, scheduling_strategy, max_number, max_duration, max_error_rate ) yield SchedulerResult( type_="run_start", @@ -159,6 +166,8 @@ async def run( run_info, ) if iter_result is not None: + if self._is_max_error_rate_reached(iter_result.run_info): + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") yield iter_result # yield control to the event loop @@ -249,6 +258,7 @@ def _run_setup( scheduling_strategy: SchedulingStrategy, max_number: Optional[int], max_duration: Optional[float], + max_error_rate: Optional[float], ) -> tuple[SchedulerRunInfo, Iterator[Any], Iterator[float]]: requests_iter = iter(self.request_loader) start_time = time.time() @@ -276,6 +286,7 @@ def _run_setup( end_number=end_number, processes=len(processes), strategy=scheduling_strategy, + max_error_rate=max_error_rate ) return info, requests_iter, times_iter @@ -362,6 +373,9 @@ def _check_result_ready( run_info.processing_requests -= 1 run_info.completed_requests += 1 + if process_response.info.errored: + run_info.errored_requests += 1 + return SchedulerRequestResult( type_="request_complete", run_info=run_info, @@ -371,6 +385,11 @@ def _check_result_ready( ) raise ValueError(f"Invalid process response type: {process_response}") + @staticmethod + def _is_max_error_rate_reached(run_info: SchedulerRunInfo) -> bool: + current_error_rate = run_info.errored_requests / run_info.end_number + return current_error_rate > run_info.max_error_rate + async def _stop_processes( self, futures: list[asyncio.Future], From 6059af183ebed636af5a2a7eed4707d943f8e7db Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Mon, 19 May 2025 11:41:33 +0300 Subject: [PATCH 02/37] wip --- src/guidellm/scheduler/scheduler.py | 18 ++++++++++-------- src/guidellm/scheduler/worker.py | 9 +++++++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 33204729..cd9231af 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -64,7 +64,6 @@ def __init__( self.worker = worker self.request_loader = request_loader - self.error_rate: Optional[float] = None async def run( self, @@ -118,8 +117,6 @@ async def run( if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): raise ValueError(f"Invalid max_error_rate: {max_error_rate}") - shutdown_event = multiprocessing.Event() - with ( multiprocessing.Manager() as manager, ProcessPoolExecutor( @@ -127,7 +124,7 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue = await self._start_processes( + futures, requests_queue, responses_queue, shutdown_event = await self._start_processes( manager, executor, scheduling_strategy ) run_info, requests_iter, times_iter = self._run_setup( @@ -167,7 +164,9 @@ async def run( 
) if iter_result is not None: if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached, sending " + f"shutdown signal") + shutdown_event.set() yield iter_result # yield control to the event loop @@ -191,8 +190,10 @@ async def _start_processes( list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, + multiprocessing.Event ]: await self.worker.prepare_multiprocessing() + shutdown_event = multiprocessing.Event() requests_queue = manager.Queue( maxsize=scheduling_strategy.queued_requests_limit ) @@ -229,6 +230,7 @@ async def _start_processes( requests_queue, responses_queue, id_, + shutdown_event, ) ) elif scheduling_strategy.processing_mode == "async": @@ -240,6 +242,7 @@ async def _start_processes( responses_queue, requests_limit, id_, + shutdown_event, ) ) else: @@ -250,7 +253,7 @@ async def _start_processes( await asyncio.sleep(0.1) # give time for processes to start - return futures, requests_queue, responses_queue + return futures, requests_queue, responses_queue, shutdown_event def _run_setup( self, @@ -385,8 +388,7 @@ def _check_result_ready( ) raise ValueError(f"Invalid process response type: {process_response}") - @staticmethod - def _is_max_error_rate_reached(run_info: SchedulerRunInfo) -> bool: + def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: current_error_rate = run_info.errored_requests / run_info.end_number return current_error_rate > run_info.max_error_rate diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index a53b14c2..2dfd4462 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -121,9 +121,13 @@ async def resolve( ... async def get_request( - self, requests_queue: multiprocessing.Queue + self, requests_queue: multiprocessing.Queue, shutdown_event: multiprocessing.Event, shutdonen_check_ ) -> Optional[WorkerProcessRequest[RequestT]]: - return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] + def _get_queue_intermittently(request_queue: multiprocessing.Queue, shutdown_event): + try: + + + return await asyncio.to_thread(_get_queue_intermittently()) # type: ignore[attr-defined] async def send_result( self, @@ -222,6 +226,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: multiprocessing.Event, ): async def _process_runner(): pending = asyncio.Semaphore(max_concurrency) From 69a5c9eb5b5a272a0821dfdf569ed0acf5bcaffe Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Mon, 19 May 2025 13:51:05 +0300 Subject: [PATCH 03/37] Revert "wip" This reverts commit 6059af183ebed636af5a2a7eed4707d943f8e7db. 
--- src/guidellm/scheduler/scheduler.py | 18 ++++++++---------- src/guidellm/scheduler/worker.py | 9 ++------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index cd9231af..33204729 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -64,6 +64,7 @@ def __init__( self.worker = worker self.request_loader = request_loader + self.error_rate: Optional[float] = None async def run( self, @@ -117,6 +118,8 @@ async def run( if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + shutdown_event = multiprocessing.Event() + with ( multiprocessing.Manager() as manager, ProcessPoolExecutor( @@ -124,7 +127,7 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue, shutdown_event = await self._start_processes( + futures, requests_queue, responses_queue = await self._start_processes( manager, executor, scheduling_strategy ) run_info, requests_iter, times_iter = self._run_setup( @@ -164,9 +167,7 @@ async def run( ) if iter_result is not None: if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached, sending " - f"shutdown signal") - shutdown_event.set() + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") yield iter_result # yield control to the event loop @@ -190,10 +191,8 @@ async def _start_processes( list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, - multiprocessing.Event ]: await self.worker.prepare_multiprocessing() - shutdown_event = multiprocessing.Event() requests_queue = manager.Queue( maxsize=scheduling_strategy.queued_requests_limit ) @@ -230,7 +229,6 @@ async def _start_processes( requests_queue, responses_queue, id_, - shutdown_event, ) ) elif scheduling_strategy.processing_mode == "async": @@ -242,7 +240,6 @@ async def _start_processes( responses_queue, requests_limit, id_, - shutdown_event, ) ) else: @@ -253,7 +250,7 @@ async def _start_processes( await asyncio.sleep(0.1) # give time for processes to start - return futures, requests_queue, responses_queue, shutdown_event + return futures, requests_queue, responses_queue def _run_setup( self, @@ -388,7 +385,8 @@ def _check_result_ready( ) raise ValueError(f"Invalid process response type: {process_response}") - def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: + @staticmethod + def _is_max_error_rate_reached(run_info: SchedulerRunInfo) -> bool: current_error_rate = run_info.errored_requests / run_info.end_number return current_error_rate > run_info.max_error_rate diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 2dfd4462..a53b14c2 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -121,13 +121,9 @@ async def resolve( ... 
async def get_request( - self, requests_queue: multiprocessing.Queue, shutdown_event: multiprocessing.Event, shutdonen_check_ + self, requests_queue: multiprocessing.Queue ) -> Optional[WorkerProcessRequest[RequestT]]: - def _get_queue_intermittently(request_queue: multiprocessing.Queue, shutdown_event): - try: - - - return await asyncio.to_thread(_get_queue_intermittently()) # type: ignore[attr-defined] + return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] async def send_result( self, @@ -226,7 +222,6 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: multiprocessing.Event, ): async def _process_runner(): pending = asyncio.Semaphore(max_concurrency) From 7795d2c23b0e3449506764102343eafbe486c5a6 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Mon, 19 May 2025 14:28:32 +0300 Subject: [PATCH 04/37] Handle infinite datasets with constant rate --- src/guidellm/request/loader.py | 8 +++++++- src/guidellm/scheduler/result.py | 2 +- src/guidellm/scheduler/scheduler.py | 17 ++++++++++++++--- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/guidellm/request/loader.py b/src/guidellm/request/loader.py index 50ab3cca..0e54fc45 100644 --- a/src/guidellm/request/loader.py +++ b/src/guidellm/request/loader.py @@ -21,9 +21,14 @@ "GenerativeRequestLoaderDescription", "RequestLoader", "RequestLoaderDescription", + "InfiniteDatasetError" ] +class InfiniteDatasetError(Exception): + pass + + class RequestLoaderDescription(StandardBaseModel): type_: Literal["request_loader"] = "request_loader" @@ -120,7 +125,8 @@ def __len__(self) -> int: if self.iter_type == "finite": return self.num_unique_items() - raise ValueError(f"Unable to determine length of dataset: {self.data}") + assert self.iter_type == "infinite" + raise InfiniteDatasetError(f"Dataset {self.data} is infinite and thus unable to determine length") @property def description(self) -> GenerativeRequestLoaderDescription: diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 9d379422..a340932d 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -43,7 +43,7 @@ class SchedulerRunInfo(StandardBaseModel): start_time: float end_time: float - end_number: float + end_number: float # ToDo: Rename to max_requests & change to int (check all references before) processes: int strategy: SchedulingStrategy max_error_rate: float diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 33204729..d0d06a4a 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -15,6 +15,7 @@ from loguru import logger from guidellm.config import settings +from guidellm.request.loader import InfiniteDatasetError from guidellm.scheduler.result import ( SchedulerRequestResult, SchedulerResult, @@ -166,8 +167,12 @@ async def run( run_info, ) if iter_result is not None: - if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") + if iter_result.request_info.errored: + if self._is_max_error_rate_reached(iter_result.run_info): + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") + else: + cur_error_rate = iter_result.run_info.errored_requests / iter_result.run_info.end_number + logger.debug(f"Current error rate {cur_error_rate}") yield iter_result # yield control to the event loop @@ -271,7 +276,13 @@ def 
_run_setup( iter_length = len(self.request_loader) # type: ignore[arg-type] if 0 < iter_length < end_number: end_number = iter_length - except Exception: # noqa: BLE001, S110 + except InfiniteDatasetError: # noqa: BLE001, S110 + if scheduling_strategy.type_ == "constant" and max_duration is not None: + end_number = scheduling_strategy.rate * max_duration + else: + # ToDo: Maybe add poison? + raise + except Exception: pass if end_number == math.inf and end_time is None: From 6d688f0bdbcb01b1735fd77971e2c82a28a38e32 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Mon, 19 May 2025 11:41:33 +0300 Subject: [PATCH 05/37] minor bug fixes --- src/guidellm/benchmark/benchmark.py | 2 +- src/guidellm/scheduler/scheduler.py | 42 +++++++++++++++------- src/guidellm/scheduler/worker.py | 54 ++++++++++++++++++++++++----- 3 files changed, 76 insertions(+), 22 deletions(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 4e2e09a3..50d2f49c 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -701,7 +701,7 @@ def from_stats( *["incomplete"] * len(incomplete), # type: ignore[list-item] *["error"] * len(errored), # type: ignore[list-item] ] - start_time = min(req.start_time for req in total) + start_time = min(req.start_time for req in total) # ToDo: Fix if total is empty end_time = max(req.end_time for req in total) total_with_prompt, total_types_with_prompt = ( diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index d0d06a4a..c58ef363 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -72,7 +72,7 @@ async def run( scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, - max_error_rate: Optional[float] = 0.05, + max_error_rate: Optional[float] = 0, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -119,8 +119,6 @@ async def run( if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): raise ValueError(f"Invalid max_error_rate: {max_error_rate}") - shutdown_event = multiprocessing.Event() - with ( multiprocessing.Manager() as manager, ProcessPoolExecutor( @@ -128,9 +126,11 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue = await self._start_processes( - manager, executor, scheduling_strategy + futures, requests_queue, responses_queue, shutdown_event = await self._start_processes( + manager, executor, scheduling_strategy, max_error_rate is not None ) + if shutdown_event: + assert not shutdown_event.is_set() run_info, requests_iter, times_iter = self._run_setup( futures, scheduling_strategy, max_number, max_duration, max_error_rate ) @@ -169,10 +169,15 @@ async def run( if iter_result is not None: if iter_result.request_info.errored: if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") - else: - cur_error_rate = iter_result.run_info.errored_requests / iter_result.run_info.end_number - logger.debug(f"Current error rate {cur_error_rate}") + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) " + f"reached, sending shutdown signal") + shutdown_event.set() + break + # else: + # # ToDo: Delete this else clause + # cur_error_rate = iter_result.run_info.errored_requests / iter_result.run_info.end_number + # logger.info(f"Current error rate 
{cur_error_rate}") + yield iter_result # yield control to the event loop @@ -192,12 +197,15 @@ async def _start_processes( manager, executor: ProcessPoolExecutor, scheduling_strategy: SchedulingStrategy, + create_shutdown_event: bool = False ) -> tuple[ list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, + Optional[multiprocessing.Event] ]: await self.worker.prepare_multiprocessing() + shutdown_event = manager.Event() if create_shutdown_event else None requests_queue = manager.Queue( maxsize=scheduling_strategy.queued_requests_limit ) @@ -207,6 +215,7 @@ async def _start_processes( scheduling_strategy.processes_limit, scheduling_strategy.processing_requests_limit, ) + num_processes = 1 requests_limit_split = ( scheduling_strategy.processing_requests_limit // scheduling_strategy.processes_limit @@ -234,6 +243,7 @@ async def _start_processes( requests_queue, responses_queue, id_, + shutdown_event, ) ) elif scheduling_strategy.processing_mode == "async": @@ -245,6 +255,7 @@ async def _start_processes( responses_queue, requests_limit, id_, + shutdown_event, ) ) else: @@ -255,7 +266,7 @@ async def _start_processes( await asyncio.sleep(0.1) # give time for processes to start - return futures, requests_queue, responses_queue + return futures, requests_queue, responses_queue, shutdown_event def _run_setup( self, @@ -278,13 +289,19 @@ def _run_setup( end_number = iter_length except InfiniteDatasetError: # noqa: BLE001, S110 if scheduling_strategy.type_ == "constant" and max_duration is not None: - end_number = scheduling_strategy.rate * max_duration + total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) + if total_requests_in_max_duration < end_number: + assert total_requests_in_max_duration > 0 + end_number = total_requests_in_max_duration else: - # ToDo: Maybe add poison? + # ToDo: Add poison raise except Exception: pass + if end_number == math.inf and max_error_rate is not None: + raise RuntimeError("Can't ensure max_error_rate since can't calculate total requests count") + if end_number == math.inf and end_time is None: logger.warning( "No end number or end time set, " @@ -409,4 +426,5 @@ async def _stop_processes( for _ in futures: requests_queue.put(None) + logger.debug("Waiting for futures to shut down") await asyncio.gather(*futures) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index a53b14c2..4515fefa 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -2,10 +2,12 @@ import math import multiprocessing import multiprocessing.queues +import queue import time from abc import ABC, abstractmethod from collections.abc import AsyncGenerator from dataclasses import dataclass +from datetime import timedelta from typing import ( Any, Generic, @@ -121,9 +123,23 @@ async def resolve( ... 
async def get_request( - self, requests_queue: multiprocessing.Queue + self, requests_queue: multiprocessing.Queue, + shutdown_event: Optional[multiprocessing.Event] = None, + process_id: Optional[int] = None, ) -> Optional[WorkerProcessRequest[RequestT]]: - return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] + if shutdown_event is not None and process_id is None: + logger.warning("shutdown_event is not None and process_id is None which makes it hard to debug") + + def _get_queue_intermittently(): + assert shutdown_event is not None + while True: + try: + return requests_queue.get(timeout=timedelta(seconds=1).total_seconds()) + except queue.Empty: + if shutdown_event.is_set(): + logger.info(f"Shutdown signal received in future {process_id}") + return + return await asyncio.to_thread(_get_queue_intermittently if shutdown_event is not None else requests_queue.get) # type: ignore[attr-defined] async def send_result( self, @@ -149,25 +165,25 @@ async def resolve_scheduler_request( scheduled_time=time.time(), process_id=process_id, ) - result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult( + request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult( type_="request_scheduled", request=request, response=None, info=info, ) - asyncio.create_task(self.send_result(results_queue, result)) + asyncio.create_task(self.send_result(results_queue, request_scheduled_result)) if (wait_time := start_time - time.time()) > 0: await asyncio.sleep(wait_time) info.worker_start = time.time() - result = WorkerProcessResult( + request_start_result = WorkerProcessResult( type_="request_start", request=request, response=None, info=info, ) - asyncio.create_task(self.send_result(results_queue, result)) + asyncio.create_task(self.send_result(results_queue, request_start_result)) status, response = await self.resolve(request, timeout_time) info.worker_end = time.time() @@ -190,11 +206,20 @@ def process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: Optional[multiprocessing.Event] = None, ): async def _process_runner(): while ( - process_request := await self.get_request(requests_queue) + process_request := await self.get_request( + requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id, + ) ) is not None: + if shutdown_event and shutdown_event.is_set(): + logger.info(f"Shutdown signal received in future {process_id}") + break + dequeued_time = time.time() await self.resolve_scheduler_request( @@ -222,6 +247,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: Optional[multiprocessing.Event] = None, ): async def _process_runner(): pending = asyncio.Semaphore(max_concurrency) @@ -230,7 +256,10 @@ async def _process_runner(): raise ValueError("Async worker called with max_concurrency < 1") while ( - process_request := await self.get_request(requests_queue) + process_request := await self.get_request( + requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id) ) is not None: dequeued_time = time.time() @@ -240,6 +269,9 @@ def _task_done(_: asyncio.Task): nonlocal pending pending.release() + if shutdown_event and shutdown_event.is_set(): + logger.info(f"Shutdown signal received in future {process_id}") + break task = asyncio.create_task( self.resolve_scheduler_request( request=process_request.request, @@ -314,12 +346,14 @@ def 
process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: Optional[multiprocessing.Event] = None ): asyncio.run(self.backend.validate()) super().process_loop_synchronous( requests_queue=requests_queue, results_queue=results_queue, process_id=process_id, + shutdown_event=shutdown_event, ) def process_loop_asynchronous( @@ -328,6 +362,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: Optional[multiprocessing.Event] = None ): asyncio.run(self.backend.validate()) super().process_loop_asynchronous( @@ -335,6 +370,7 @@ def process_loop_asynchronous( results_queue=results_queue, max_concurrency=max_concurrency, process_id=process_id, + shutdown_event=shutdown_event, ) async def resolve( @@ -375,7 +411,7 @@ async def resolve( request_func, request_kwargs = self._create_request_func_kwargs(request) async def _runner(): - # wrap function so we can enforce timeout and + # wrap function so that we can enforce timeout and # still return the latest state from the backend async for resp in request_func(**request_kwargs): # type: ignore[operator] nonlocal response From ede651aca1bc0fd0de65fc869bea09798f1902c2 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 13:21:19 +0300 Subject: [PATCH 06/37] bugfix / last request not yielded --- src/guidellm/scheduler/scheduler.py | 71 ++++++++++++++++------------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index c58ef363..628a9ac7 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -72,7 +72,7 @@ async def run( scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, - max_error_rate: Optional[float] = 0, + max_error_rate: Optional[float] = None, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -140,7 +140,8 @@ async def run( ) try: - while True: + max_error_rate_reached = False + while not max_error_rate_reached: # check errors and raise them for future in futures: if future.done() and (err := future.exception()) is not None: @@ -167,17 +168,13 @@ async def run( run_info, ) if iter_result is not None: - if iter_result.request_info.errored: - if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) " - f"reached, sending shutdown signal") - shutdown_event.set() - break - # else: - # # ToDo: Delete this else clause - # cur_error_rate = iter_result.run_info.errored_requests / iter_result.run_info.end_number - # logger.info(f"Current error rate {cur_error_rate}") - + if iter_result.request_info.errored \ + and not iter_result.request_info.canceled \ + and self._is_max_error_rate_reached(iter_result.run_info): + shutdown_event.set() + max_error_rate_reached = True + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) " + f"reached, sending shutdown signal") yield iter_result # yield control to the event loop @@ -280,27 +277,10 @@ def _run_setup( start_time = time.time() times_iter = iter(scheduling_strategy.request_times()) end_time = time.time() + (max_duration or math.inf) - end_number = max_number or math.inf - - try: - # update end number if the request loader is finite and less than max - iter_length = len(self.request_loader) # type: ignore[arg-type] - if 
0 < iter_length < end_number: - end_number = iter_length - except InfiniteDatasetError: # noqa: BLE001, S110 - if scheduling_strategy.type_ == "constant" and max_duration is not None: - total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) - if total_requests_in_max_duration < end_number: - assert total_requests_in_max_duration > 0 - end_number = total_requests_in_max_duration - else: - # ToDo: Add poison - raise - except Exception: - pass + end_number = self._determine_total_requests_count(scheduling_strategy, max_duration, max_error_rate, max_number) if end_number == math.inf and max_error_rate is not None: - raise RuntimeError("Can't ensure max_error_rate since can't calculate total requests count") + logger.warning("max_error_rate will be ignored because end_number can not be determined.") if end_number == math.inf and end_time is None: logger.warning( @@ -319,6 +299,33 @@ def _run_setup( return info, requests_iter, times_iter + def _determine_total_requests_count( + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_error_rate: Optional[float], + max_number: Optional[int], + ) -> int: + end_number = max_number or math.inf + try: + # update end number if the request loader is finite and less than max + iter_length = len(self.request_loader) # type: ignore[arg-type] + if 0 < iter_length < end_number: + end_number = iter_length + except InfiniteDatasetError: # noqa: BLE001, S110 + if scheduling_strategy.type_ == "constant" and max_duration is not None: + total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) + if total_requests_in_max_duration < end_number: + assert total_requests_in_max_duration > 0 + end_number = total_requests_in_max_duration + else: + if max_error_rate: + logger.warning() + raise + except Exception: + pass + return end_number + def _add_requests( self, requests_iter: Optional[Iterator[Any]], From a17117c7dc3d973fd328d6754083dc9471db01b1 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 13:21:47 +0300 Subject: [PATCH 07/37] Add max error rate to readme, CLI & report --- README.md | 2 ++ src/guidellm/__main__.py | 12 ++++++++++++ src/guidellm/benchmark/aggregator.py | 3 +++ src/guidellm/benchmark/benchmark.py | 9 +++++++++ src/guidellm/benchmark/benchmarker.py | 9 +++++++++ src/guidellm/benchmark/entrypoints.py | 2 ++ 6 files changed, 37 insertions(+) diff --git a/README.md b/README.md index a46fd411..416d3cc1 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted. +- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only for finite deterministic scenarios i.e `rate_type` is `constant` and `--max-seconds` exists OR `--max-requests` exists OR the dataset is finite. If `--max-error-rate` is `None`, benchmarks will continue regardless of error rate. + - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results. - `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results. 
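+
+For example, a hypothetical constant-rate run that stops early once more than 10% of finished requests have errored might look like the following (the target, data, and rate values here are illustrative only, not defaults):
+
+```bash
+guidellm benchmark \
+  --target "http://localhost:8000" \
+  --data "prompt_tokens=256,output_tokens=128" \
+  --rate-type constant \
+  --rate 5 \
+  --max-seconds 120 \
+  --max-error-rate 0.1
+```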
diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index f38b11aa..baea9f13 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -163,6 +163,16 @@ def cli(): "If None, will run until max_seconds or the data is exhausted." ), ) +@click.option( + "--max-error-rate", + type=float, + help=( + "The maximum error rate after which a benchmark will stop. " + "Applicable only for finite deterministic scenarios i.e rate_type is 'constant' and 'max_seconds' exists OR " + "'max_requests' exists OR the dataset is finite. " + "If None, benchmarks will continue regardless of error rate." + ), +) @click.option( "--warmup-percent", type=float, @@ -242,6 +252,7 @@ def benchmark( rate, max_seconds, max_requests, + max_error_rate, warmup_percent, cooldown_percent, disable_progress, @@ -267,6 +278,7 @@ def benchmark( rate=rate, max_seconds=max_seconds, max_requests=max_requests, + max_error_rate=max_error_rate, warmup_percent=warmup_percent, cooldown_percent=cooldown_percent, show_progress=not disable_progress, diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index 9943f169..9fe80be8 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -599,6 +599,8 @@ def compile(self) -> GenerativeBenchmark: and return the compiled object. """ successful, incomplete, errored = self._compile_results() + error_rate = self.requests_stats.totals.errored.total / \ + (self.requests_stats.totals.successful + self.requests_stats.totals.errored.total) return GenerativeBenchmark.from_stats( run_id=self.run_id, @@ -625,6 +627,7 @@ def compile(self) -> GenerativeBenchmark: request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean, request_time_delay_avg=self.requests_stats.request_time_delay.mean, request_time_avg=self.requests_stats.request_time.mean, + error_rate=error_rate, ), worker=self.worker_description, requests_loader=self.request_loader_description, diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 50d2f49c..dee71fb7 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -90,6 +90,9 @@ class BenchmarkArgs(StandardBaseModel): max_duration: Optional[float] = Field( description="The maximum duration in seconds to run this benchmark, if any." ) + max_error_rate: Optional[float] = Field( + description="Maximum error rate after which a benchmark will stop." + ) warmup_number: Optional[int] = Field( description=( "The number of requests to run for the warmup phase of this benchmark, " @@ -213,6 +216,12 @@ class BenchmarkRunStats(StandardBaseModel): "it was completed." ) ) + error_rate: float = Field( + description=( + "The number of errored requests divided by the number of errored requests. This can be higher " + "than max_error_rate (if applicable) cause it does not take into account incomplete requests." 
+ ) + ) class BenchmarkMetrics(StandardBaseModel): diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index 11b6d245..7da25a3b 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -74,6 +74,11 @@ class BenchmarkerStrategyLimits(StandardBaseModel): description="Maximum duration (in seconds) to process requests per strategy.", ge=0, ) + max_error_rate: Optional[float] = Field( + description="Maximum error rate after which a sync benchmark will stop", + ge=0, + le=1, + ) warmup_percent_per_strategy: Optional[float] = Field( description="Percentage of requests to use for warmup.", ge=0, @@ -148,6 +153,7 @@ async def run( profile: Profile, max_number_per_strategy: Optional[int], max_duration_per_strategy: Optional[float], + max_error_rate: Optional[float], warmup_percent_per_strategy: Optional[float], cooldown_percent_per_strategy: Optional[float], ) -> AsyncGenerator[ @@ -162,6 +168,7 @@ async def run( requests_loader_size=requests_loader_size, max_number_per_strategy=max_number_per_strategy, max_duration_per_strategy=max_duration_per_strategy, + max_error_rate=max_error_rate, warmup_percent_per_strategy=warmup_percent_per_strategy, cooldown_percent_per_strategy=cooldown_percent_per_strategy, ) @@ -196,6 +203,7 @@ async def run( scheduling_strategy=scheduling_strategy, max_number=max_number_per_strategy, max_duration=max_duration_per_strategy, + max_error_rate=max_error_rate, ): if result.type_ == "run_start": yield BenchmarkerResult( @@ -321,6 +329,7 @@ def create_benchmark_aggregator( strategy=strategy, max_number=limits.max_number, max_duration=limits.max_duration, + max_error_rate=limits.max_error_rate, warmup_number=limits.warmup_number, warmup_duration=limits.warmup_duration, cooldown_number=limits.cooldown_number, diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index f252cf27..7e4af8c0 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -41,6 +41,7 @@ async def benchmark_generative_text( rate: Optional[Union[int, float, list[Union[int, float]]]], max_seconds: Optional[float], max_requests: Optional[int], + max_error_rate: Optional[float], warmup_percent: Optional[float], cooldown_percent: Optional[float], show_progress: bool, @@ -107,6 +108,7 @@ async def benchmark_generative_text( profile=profile, max_number_per_strategy=max_requests, max_duration_per_strategy=max_seconds, + max_error_rate=max_error_rate, warmup_percent_per_strategy=warmup_percent, cooldown_percent_per_strategy=cooldown_percent, ): From 34cb6b6cbd3a1c9efe72f7db8dd8578936dd92cd Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 13:43:34 +0300 Subject: [PATCH 08/37] make max_error_rate optional --- src/guidellm/scheduler/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index a340932d..4159f8f3 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -46,7 +46,7 @@ class SchedulerRunInfo(StandardBaseModel): end_number: float # ToDo: Rename to max_requests & change to int (check all references before) processes: int strategy: SchedulingStrategy - max_error_rate: float + max_error_rate: Optional[float] = None created_requests: int = 0 queued_requests: int = 0 From 6289c07e4e8aed4aa7bfbd6223ea401f2ca3993c Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 13:48:12 +0300 Subject: [PATCH 09/37] 
minor fixes

---
 src/guidellm/scheduler/scheduler.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 628a9ac7..07d4b2e1 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -277,7 +277,7 @@ def _run_setup(
         start_time = time.time()
         times_iter = iter(scheduling_strategy.request_times())
         end_time = time.time() + (max_duration or math.inf)
-        end_number = self._determine_total_requests_count(scheduling_strategy, max_duration, max_error_rate, max_number)
+        end_number = self._determine_total_requests_count(scheduling_strategy, max_duration, max_number)

         if end_number == math.inf and max_error_rate is not None:
             logger.warning("max_error_rate will be ignored because end_number can not be determined.")
@@ -303,7 +303,6 @@ def _determine_total_requests_count(
         self,
         scheduling_strategy: SchedulingStrategy,
         max_duration: Optional[float],
-        max_error_rate: Optional[float],
         max_number: Optional[int],
     ) -> int:
         end_number = max_number or math.inf
@@ -318,10 +317,6 @@ def _determine_total_requests_count(
                 if total_requests_in_max_duration < end_number:
                     assert total_requests_in_max_duration > 0
                     end_number = total_requests_in_max_duration
-            else:
-                if max_error_rate:
-                    logger.warning()
-                    raise
         except Exception:
             pass
         return end_number

From d5ee01822affd222141d2e6845b921d8f09e467f Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Wed, 21 May 2025 14:12:11 +0300
Subject: [PATCH 10/37] report error rate bugfix

---
 src/guidellm/benchmark/aggregator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py
index 9fe80be8..b66ae1f7 100644
--- a/src/guidellm/benchmark/aggregator.py
+++ b/src/guidellm/benchmark/aggregator.py
@@ -600,7 +600,7 @@ def compile(self) -> GenerativeBenchmark:
         """
         successful, incomplete, errored = self._compile_results()
         error_rate = self.requests_stats.totals.errored.total / \
-            (self.requests_stats.totals.successful + self.requests_stats.totals.errored.total)
+            (self.requests_stats.totals.successful.total + self.requests_stats.totals.errored.total)

         return GenerativeBenchmark.from_stats(
             run_id=self.run_id,

From ce13ef7294d448c0d03a32ef4a70699188617942 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Wed, 21 May 2025 14:12:24 +0300
Subject: [PATCH 11/37] add current error rate log

---
 src/guidellm/scheduler/scheduler.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 07d4b2e1..1edc4286 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -168,13 +168,17 @@ async def run(
                         run_info,
                     )
                     if iter_result is not None:
-                        if iter_result.request_info.errored \
-                                and not iter_result.request_info.canceled \
-                                and self._is_max_error_rate_reached(iter_result.run_info):
-                            shutdown_event.set()
-                            max_error_rate_reached = True
-                            logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) "
-                                        f"reached, sending shutdown signal")
+                        if iter_result.request_info.errored and not iter_result.request_info.canceled:
+                            current_error_rate = run_info.errored_requests / run_info.end_number
+                            is_over_max_error_rate = run_info.max_error_rate < current_error_rate
+
+                            if is_over_max_error_rate:
+                                shutdown_event.set()
+                                max_error_rate_reached = True
+                                logger.info(f"Max error rate of ({iter_result.run_info.max_error_rate}) 
" + f"reached, sending shutdown signal") + else: + logger.debug(f"Current error rate: {current_error_rate}") yield iter_result # yield control to the event loop @@ -415,11 +419,6 @@ def _check_result_ready( ) raise ValueError(f"Invalid process response type: {process_response}") - @staticmethod - def _is_max_error_rate_reached(run_info: SchedulerRunInfo) -> bool: - current_error_rate = run_info.errored_requests / run_info.end_number - return current_error_rate > run_info.max_error_rate - async def _stop_processes( self, futures: list[asyncio.Future], From 9a68a7687360048f62b5fb880a9bce95fe1313ea Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 14:19:03 +0300 Subject: [PATCH 12/37] remove todo --- src/guidellm/scheduler/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 4159f8f3..f899f54a 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -43,7 +43,7 @@ class SchedulerRunInfo(StandardBaseModel): start_time: float end_time: float - end_number: float # ToDo: Rename to max_requests & change to int (check all references before) + end_number: float processes: int strategy: SchedulingStrategy max_error_rate: Optional[float] = None From 6dd313de3b4275ec87fdb9c76685602d6a806e76 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 15:58:03 +0300 Subject: [PATCH 13/37] Fix tests --- src/guidellm/benchmark/output.py | 1 + src/guidellm/scheduler/scheduler.py | 6 ++++-- tests/unit/benchmark/test_output.py | 2 +- tests/unit/mock_benchmark.py | 2 ++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/guidellm/benchmark/output.py b/src/guidellm/benchmark/output.py index 4847160d..33b1efc2 100644 --- a/src/guidellm/benchmark/output.py +++ b/src/guidellm/benchmark/output.py @@ -419,6 +419,7 @@ def benchmarks_args_str(self) -> str: { "max_number": args.max_number, "max_duration": args.max_duration, + "max_error_rate": args.max_error_rate, "warmup_number": args.warmup_number, "warmup_duration": args.warmup_duration, "cooldown_number": args.cooldown_number, diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 1edc4286..3dd873d0 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -315,13 +315,15 @@ def _determine_total_requests_count( iter_length = len(self.request_loader) # type: ignore[arg-type] if 0 < iter_length < end_number: end_number = iter_length - except InfiniteDatasetError: # noqa: BLE001, S110 + except InfiniteDatasetError: + # Only when RPS is constant and duration is capped we can determine the total + # amount of requests that are supposed to be sent if scheduling_strategy.type_ == "constant" and max_duration is not None: total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) if total_requests_in_max_duration < end_number: assert total_requests_in_max_duration > 0 end_number = total_requests_in_max_duration - except Exception: + except Exception: # noqa: BLE001, S110 pass return end_number diff --git a/tests/unit/benchmark/test_output.py b/tests/unit/benchmark/test_output.py index 9076834b..e3114491 100644 --- a/tests/unit/benchmark/test_output.py +++ b/tests/unit/benchmark/test_output.py @@ -113,7 +113,7 @@ def test_console_benchmarks_args_str(): mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_args_str == ( - "max_number=None, max_duration=10.0, 
warmup_number=None, " + "max_number=None, max_duration=10.0, max_error_rate=0.05, warmup_number=None, " "warmup_duration=None, cooldown_number=None, cooldown_duration=None" ) diff --git a/tests/unit/mock_benchmark.py b/tests/unit/mock_benchmark.py index 81364fa1..3c360c68 100644 --- a/tests/unit/mock_benchmark.py +++ b/tests/unit/mock_benchmark.py @@ -221,6 +221,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: strategy=SynchronousStrategy(), max_number=None, max_duration=10.0, + max_error_rate=0.05, warmup_number=None, warmup_duration=None, cooldown_number=None, @@ -245,6 +246,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: request_start_time_targeted_delay_avg=1.2827096836907523, request_time_delay_avg=0.0004316908972603934, request_time_avg=1.426228676523481, + error_rate=0.345346, ), worker=GenerativeRequestsWorkerDescription( backend_type="openai_http", From 3697b308cd87c34e84370fdd6da04ef29c1a5ae9 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 16:14:49 +0300 Subject: [PATCH 14/37] Pre CR fixes --- README.md | 2 +- src/guidellm/__main__.py | 2 +- src/guidellm/benchmark/benchmark.py | 2 +- src/guidellm/scheduler/scheduler.py | 5 +++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 416d3cc1..0988c70e 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ The `guidellm benchmark` command is used to run benchmarks against a generative - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted. -- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only for finite deterministic scenarios i.e `rate_type` is `constant` and `--max-seconds` exists OR `--max-requests` exists OR the dataset is finite. If `--max-error-rate` is `None`, benchmarks will continue regardless of error rate. +- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only for finite deterministic scenarios i.e `rate_type` is `constant` and `--max-seconds` exists OR `--max-requests` exists OR the dataset is finite. If `--max-error-rate` is `None` or not applicable, benchmarks will continue regardless of error rate. - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results. diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index baea9f13..5628857b 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -170,7 +170,7 @@ def cli(): "The maximum error rate after which a benchmark will stop. " "Applicable only for finite deterministic scenarios i.e rate_type is 'constant' and 'max_seconds' exists OR " "'max_requests' exists OR the dataset is finite. " - "If None, benchmarks will continue regardless of error rate." + "If None or not applicable, benchmarks will continue regardless of error rate." 
), ) @click.option( diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index dee71fb7..dd391bfc 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -710,7 +710,7 @@ def from_stats( *["incomplete"] * len(incomplete), # type: ignore[list-item] *["error"] * len(errored), # type: ignore[list-item] ] - start_time = min(req.start_time for req in total) # ToDo: Fix if total is empty + start_time = min(req.start_time for req in total) end_time = max(req.end_time for req in total) total_with_prompt, total_types_with_prompt = ( diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 3dd873d0..c92bdc76 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -102,7 +102,8 @@ async def run( If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. :param max_error_rate: The maximum error rate after which the scheduler shuts down. - If not provided a default of 5% i.e 0.05 is used. + Only applicable in benchmarks with finite deterministic number of requests. + If None or not applicable then scheduler will continue regardless of errors. :return: An asynchronous generator that yields SchedulerResult objects. Each SchedulerResult object contains information about the request, the response, and the run information. @@ -130,7 +131,7 @@ async def run( manager, executor, scheduling_strategy, max_error_rate is not None ) if shutdown_event: - assert not shutdown_event.is_set() + assert not shutdown_event.is_set(), "shutdown_event is set before starting scheduling" run_info, requests_iter, times_iter = self._run_setup( futures, scheduling_strategy, max_number, max_duration, max_error_rate ) From 2fe64c7092265be8e9a2f6543fc7af9968930703 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 16:19:04 +0300 Subject: [PATCH 15/37] CR Fixes --- src/guidellm/benchmark/benchmarker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index 7da25a3b..ecb721f7 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -75,7 +75,7 @@ class BenchmarkerStrategyLimits(StandardBaseModel): ge=0, ) max_error_rate: Optional[float] = Field( - description="Maximum error rate after which a sync benchmark will stop", + description="Maximum error rate after which a benchmark will stop", ge=0, le=1, ) From b54ab14d668a8af007cf9382b29917ccee994764 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 18:14:40 +0300 Subject: [PATCH 16/37] Lint fixes --- src/guidellm/__main__.py | 3 +- src/guidellm/benchmark/aggregator.py | 10 ++- src/guidellm/benchmark/benchmark.py | 6 +- src/guidellm/request/__init__.py | 2 + src/guidellm/request/loader.py | 11 ++-- src/guidellm/scheduler/scheduler.py | 93 ++++++++++++++++------------ src/guidellm/scheduler/worker.py | 21 +++++-- 7 files changed, 93 insertions(+), 53 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 5628857b..8a1b9ff0 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -168,7 +168,8 @@ def cli(): type=float, help=( "The maximum error rate after which a benchmark will stop. 
" - "Applicable only for finite deterministic scenarios i.e rate_type is 'constant' and 'max_seconds' exists OR " + "Applicable only for finite deterministic scenarios i.e " + "rate_type is 'constant' and 'max_seconds' exists OR " "'max_requests' exists OR the dataset is finite. " "If None or not applicable, benchmarks will continue regardless of error rate." ), diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index b66ae1f7..73ae622a 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -599,8 +599,8 @@ def compile(self) -> GenerativeBenchmark: and return the compiled object. """ successful, incomplete, errored = self._compile_results() - error_rate = self.requests_stats.totals.errored.total / \ - (self.requests_stats.totals.successful.total + self.requests_stats.totals.errored.total) + + error_rate = self._calculate_error_rate() return GenerativeBenchmark.from_stats( run_id=self.run_id, @@ -634,6 +634,12 @@ def compile(self) -> GenerativeBenchmark: extras=self.extras, ) + def _calculate_error_rate(self) -> float: + total_successful = self.requests_stats.totals.successful.total + total_errored = self.requests_stats.totals.errored.total + total_sent = total_errored + total_successful + return total_errored / total_sent + def _compile_results( self, ) -> tuple[ diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index dd391bfc..40ffefba 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -218,8 +218,10 @@ class BenchmarkRunStats(StandardBaseModel): ) error_rate: float = Field( description=( - "The number of errored requests divided by the number of errored requests. This can be higher " - "than max_error_rate (if applicable) cause it does not take into account incomplete requests." + "The number of errored requests divided by the number " + "of errored requests. This can be higher than max_error_rate " + "(if applicable) cause it does not take into " + "account incomplete requests." 
) ) diff --git a/src/guidellm/request/__init__.py b/src/guidellm/request/__init__.py index db3059cc..606fb897 100644 --- a/src/guidellm/request/__init__.py +++ b/src/guidellm/request/__init__.py @@ -1,6 +1,7 @@ from .loader import ( GenerativeRequestLoader, GenerativeRequestLoaderDescription, + GetInfiniteDatasetLengthError, RequestLoader, RequestLoaderDescription, ) @@ -10,6 +11,7 @@ "GenerationRequest", "GenerativeRequestLoader", "GenerativeRequestLoaderDescription", + "GetInfiniteDatasetLengthError", "RequestLoader", "RequestLoaderDescription", ] diff --git a/src/guidellm/request/loader.py b/src/guidellm/request/loader.py index 0e54fc45..62bd17ea 100644 --- a/src/guidellm/request/loader.py +++ b/src/guidellm/request/loader.py @@ -19,13 +19,13 @@ __all__ = [ "GenerativeRequestLoader", "GenerativeRequestLoaderDescription", + "GetInfiniteDatasetLengthError", "RequestLoader", "RequestLoaderDescription", - "InfiniteDatasetError" ] -class InfiniteDatasetError(Exception): +class GetInfiniteDatasetLengthError(Exception): pass @@ -125,8 +125,11 @@ def __len__(self) -> int: if self.iter_type == "finite": return self.num_unique_items() - assert self.iter_type == "infinite" - raise InfiniteDatasetError(f"Dataset {self.data} is infinite and thus unable to determine length") + if self.iter_type != "infinite": + raise ValueError(f"Invalid iter_type {self.iter_type}") + raise GetInfiniteDatasetLengthError(f"Dataset {self.data} is " + f"infinite and thus " + f"unable to determine length") @property def description(self) -> GenerativeRequestLoaderDescription: diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index c92bdc76..6bdcbcfe 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -15,7 +15,7 @@ from loguru import logger from guidellm.config import settings -from guidellm.request.loader import InfiniteDatasetError +from guidellm.request.loader import GetInfiniteDatasetLengthError from guidellm.scheduler.result import ( SchedulerRequestResult, SchedulerResult, @@ -101,24 +101,15 @@ async def run( :param max_duration: The maximum duration for the scheduling run. If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. - :param max_error_rate: The maximum error rate after which the scheduler shuts down. + :param max_error_rate: The maximum error rate after which the + scheduler shuts down. Only applicable in benchmarks with finite deterministic number of requests. If None or not applicable then scheduler will continue regardless of errors. :return: An asynchronous generator that yields SchedulerResult objects. Each SchedulerResult object contains information about the request, the response, and the run information. 
""" - if scheduling_strategy is None or not isinstance( - scheduling_strategy, SchedulingStrategy - ): - raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") - - if max_number is not None and max_number < 1: - raise ValueError(f"Invalid max_number: {max_number}") - if max_duration is not None and max_duration < 0: - raise ValueError(f"Invalid max_duration: {max_duration}") - if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): - raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + self._validate_scheduler_params(scheduling_strategy, max_duration, max_error_rate, max_number) with ( multiprocessing.Manager() as manager, @@ -127,11 +118,13 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue, shutdown_event = await self._start_processes( - manager, executor, scheduling_strategy, max_error_rate is not None - ) - if shutdown_event: - assert not shutdown_event.is_set(), "shutdown_event is set before starting scheduling" + futures, requests_queue, responses_queue, shutdown_event = \ + await self._start_processes( + manager, executor, scheduling_strategy, max_error_rate is not None) + if shutdown_event and shutdown_event.is_set(): + raise RuntimeError( + "shutdown_event is set before starting scheduling" + ) run_info, requests_iter, times_iter = self._run_setup( futures, scheduling_strategy, max_number, max_duration, max_error_rate ) @@ -169,17 +162,14 @@ async def run( run_info, ) if iter_result is not None: - if iter_result.request_info.errored and not iter_result.request_info.canceled: - current_error_rate = run_info.errored_requests / run_info.end_number - is_over_max_error_rate = run_info.max_error_rate < current_error_rate - - if is_over_max_error_rate: - shutdown_event.set() - max_error_rate_reached = True - logger.info(f"Max error rate of ({iter_result.run_info.max_error_rate}) " - f"reached, sending shutdown signal") - else: - logger.debug(f"Current error rate: {current_error_rate}") + if iter_result.request_info.errored \ + and not iter_result.request_info.canceled \ + and self._is_max_error_rate_reached(iter_result.run_info): + shutdown_event.set() + max_error_rate_reached = True + logger.info(f"Max error rate of " + f"({iter_result.run_info.max_error_rate}) " + f"reached, sending shutdown signal") yield iter_result # yield control to the event loop @@ -194,6 +184,28 @@ async def run( await self._stop_processes(futures, requests_queue) + def _validate_scheduler_params( + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_error_rate: Optional[float], + max_number: Optional[int] + ) -> None: + if scheduling_strategy is None or not isinstance( + scheduling_strategy, SchedulingStrategy + ): + raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") + if max_number is not None and max_number < 1: + raise ValueError(f"Invalid max_number: {max_number}") + if max_duration is not None and max_duration < 0: + raise ValueError(f"Invalid max_duration: {max_duration}") + if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): + raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + + def _is_max_error_rate_reached(self, run_info) -> bool: + current_error_rate = run_info.errored_requests / run_info.end_number + return run_info.max_error_rate < current_error_rate + async def _start_processes( self, manager, @@ -282,10 +294,13 @@ def _run_setup( start_time = time.time() times_iter = 
iter(scheduling_strategy.request_times()) end_time = time.time() + (max_duration or math.inf) - end_number = self._determine_total_requests_count(scheduling_strategy, max_duration, max_number) + end_number = self._determine_total_requests_count( + scheduling_strategy, max_duration, max_number + ) if end_number == math.inf and max_error_rate is not None: - logger.warning("max_error_rate will be ignored because end_number can not be determined.") + logger.warning("max_error_rate will be ignored " + "because end_number can not be determined.") if end_number == math.inf and end_time is None: logger.warning( @@ -312,17 +327,19 @@ def _determine_total_requests_count( ) -> int: end_number = max_number or math.inf try: - # update end number if the request loader is finite and less than max + # update end_number if the request_loader is finite and less than max_number iter_length = len(self.request_loader) # type: ignore[arg-type] if 0 < iter_length < end_number: end_number = iter_length - except InfiniteDatasetError: - # Only when RPS is constant and duration is capped we can determine the total - # amount of requests that are supposed to be sent + except GetInfiniteDatasetLengthError: + # Only when RPS is constant and duration is + # capped we can determine the total amount of requests + # that are supposed to be sent if scheduling_strategy.type_ == "constant" and max_duration is not None: - total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) - if total_requests_in_max_duration < end_number: - assert total_requests_in_max_duration > 0 + total_requests_in_max_duration = int( + scheduling_strategy.rate * max_duration + ) + if 0 < total_requests_in_max_duration < end_number: end_number = total_requests_in_max_duration except Exception: # noqa: BLE001, S110 pass diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 4515fefa..800207a0 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -128,18 +128,26 @@ async def get_request( process_id: Optional[int] = None, ) -> Optional[WorkerProcessRequest[RequestT]]: if shutdown_event is not None and process_id is None: - logger.warning("shutdown_event is not None and process_id is None which makes it hard to debug") + logger.warning("shutdown_event is not None and process_id " + "is None which makes it hard to debug") def _get_queue_intermittently(): - assert shutdown_event is not None + if shutdown_event is None: + raise ValueError("Shouldn't use _get_queue_intermittently " + "if there's no shutdown_even") while True: try: - return requests_queue.get(timeout=timedelta(seconds=1).total_seconds()) + get_timeout = timedelta(seconds=1).total_seconds() + return requests_queue.get(timeout=get_timeout) except queue.Empty: if shutdown_event.is_set(): logger.info(f"Shutdown signal received in future {process_id}") - return - return await asyncio.to_thread(_get_queue_intermittently if shutdown_event is not None else requests_queue.get) # type: ignore[attr-defined] + return None + + get_method = _get_queue_intermittently \ + if shutdown_event is not None \ + else requests_queue.get + return await asyncio.to_thread(get_method) # type: ignore[attr-defined] async def send_result( self, @@ -165,7 +173,8 @@ async def resolve_scheduler_request( scheduled_time=time.time(), process_id=process_id, ) - request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult( + request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = \ + WorkerProcessResult( 
type_="request_scheduled", request=request, response=None, From b502c9488cd497831a821f91291a42eecfe01c33 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 18:16:22 +0300 Subject: [PATCH 17/37] Lint fixes --- src/guidellm/scheduler/scheduler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 6bdcbcfe..db505181 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -109,7 +109,10 @@ async def run( Each SchedulerResult object contains information about the request, the response, and the run information. """ - self._validate_scheduler_params(scheduling_strategy, max_duration, max_error_rate, max_number) + self._validate_scheduler_params(scheduling_strategy, + max_duration, + max_error_rate, + max_number) with ( multiprocessing.Manager() as manager, @@ -163,8 +166,8 @@ async def run( ) if iter_result is not None: if iter_result.request_info.errored \ - and not iter_result.request_info.canceled \ - and self._is_max_error_rate_reached(iter_result.run_info): + and not iter_result.request_info.canceled \ + and self._is_max_error_rate_reached(iter_result.run_info): shutdown_event.set() max_error_rate_reached = True logger.info(f"Max error rate of " From 332ef08a5084c3846a38d444c815196cf3190266 Mon Sep 17 00:00:00 2001 From: markvaykhansky Date: Wed, 21 May 2025 19:14:21 +0300 Subject: [PATCH 18/37] better var name --- src/guidellm/benchmark/aggregator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index 73ae622a..cd725326 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -637,8 +637,8 @@ def compile(self) -> GenerativeBenchmark: def _calculate_error_rate(self) -> float: total_successful = self.requests_stats.totals.successful.total total_errored = self.requests_stats.totals.errored.total - total_sent = total_errored + total_successful - return total_errored / total_sent + total_finished = total_errored + total_successful + return total_errored / total_finished def _compile_results( self, From c2fd813233fe0cdd253796205464e6e6167deeff Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 08:33:33 +0300 Subject: [PATCH 19/37] Type fixes, typos & bugfixes --- src/guidellm/__main__.py | 4 ++-- src/guidellm/scheduler/scheduler.py | 16 ++++++++++++---- src/guidellm/scheduler/worker.py | 15 ++++++++------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 8a1b9ff0..bfa566b2 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -179,7 +179,7 @@ def cli(): type=float, default=None, help=( - "The percent of the benchmark (based on max-seconds, max-requets, " + "The percent of the benchmark (based on max-seconds, max-requests, " "or lenth of dataset) to run as a warmup and not include in the final results. " "Defaults to None." ), @@ -188,7 +188,7 @@ def cli(): "--cooldown-percent", type=float, help=( - "The percent of the benchmark (based on max-seconds, max-requets, or lenth " + "The percent of the benchmark (based on max-seconds, max-requests, or length " "of dataset) to run as a cooldown and not include in the final results. " "Defaults to None." 
), diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index db505181..ceffecd3 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -5,6 +5,7 @@ import time from collections.abc import AsyncGenerator, Iterable, Iterator from concurrent.futures import ProcessPoolExecutor +from multiprocessing.synchronize import Event as MultiprocessingEvent from typing import ( Any, Generic, @@ -168,11 +169,15 @@ async def run( if iter_result.request_info.errored \ and not iter_result.request_info.canceled \ and self._is_max_error_rate_reached(iter_result.run_info): + if shutdown_event is None: + raise RuntimeError("We've reached max_error_rate " + "but shutdown_event is corrupt") shutdown_event.set() max_error_rate_reached = True logger.info(f"Max error rate of " f"({iter_result.run_info.max_error_rate}) " f"reached, sending shutdown signal") + logger.info("Itter is not None") yield iter_result # yield control to the event loop @@ -205,8 +210,12 @@ def _validate_scheduler_params( if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): raise ValueError(f"Invalid max_error_rate: {max_error_rate}") - def _is_max_error_rate_reached(self, run_info) -> bool: + def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: + if run_info.max_error_rate is None: + return False current_error_rate = run_info.errored_requests / run_info.end_number + logger.info(f"Current error rate {current_error_rate} " + f"i.e total_finished [success / error] / max total possible") return run_info.max_error_rate < current_error_rate async def _start_processes( @@ -219,7 +228,7 @@ async def _start_processes( list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, - Optional[multiprocessing.Event] + Optional[MultiprocessingEvent] ]: await self.worker.prepare_multiprocessing() shutdown_event = manager.Event() if create_shutdown_event else None @@ -232,7 +241,6 @@ async def _start_processes( scheduling_strategy.processes_limit, scheduling_strategy.processing_requests_limit, ) - num_processes = 1 requests_limit_split = ( scheduling_strategy.processing_requests_limit // scheduling_strategy.processes_limit @@ -327,7 +335,7 @@ def _determine_total_requests_count( scheduling_strategy: SchedulingStrategy, max_duration: Optional[float], max_number: Optional[int], - ) -> int: + ) -> Union[int, float]: end_number = max_number or math.inf try: # update end_number if the request_loader is finite and less than max_number diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 800207a0..f4072c5d 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -1,6 +1,5 @@ import asyncio import math -import multiprocessing import multiprocessing.queues import queue import time @@ -8,6 +7,7 @@ from collections.abc import AsyncGenerator from dataclasses import dataclass from datetime import timedelta +from multiprocessing.synchronize import Event as MultiprocessingEvent from typing import ( Any, Generic, @@ -124,7 +124,7 @@ async def resolve( async def get_request( self, requests_queue: multiprocessing.Queue, - shutdown_event: Optional[multiprocessing.Event] = None, + shutdown_event: Optional[MultiprocessingEvent] = None, process_id: Optional[int] = None, ) -> Optional[WorkerProcessRequest[RequestT]]: if shutdown_event is not None and process_id is None: @@ -186,7 +186,8 @@ async def resolve_scheduler_request( await asyncio.sleep(wait_time) info.worker_start = time.time() - 
request_start_result = WorkerProcessResult( + request_start_result: WorkerProcessResult[RequestT, ResponseT] = \ + WorkerProcessResult( type_="request_start", request=request, response=None, @@ -215,7 +216,7 @@ def process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, - shutdown_event: Optional[multiprocessing.Event] = None, + shutdown_event: Optional[MultiprocessingEvent] = None, ): async def _process_runner(): while ( @@ -256,7 +257,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: Optional[multiprocessing.Event] = None, + shutdown_event: Optional[MultiprocessingEvent] = None, ): async def _process_runner(): pending = asyncio.Semaphore(max_concurrency) @@ -355,7 +356,7 @@ def process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, - shutdown_event: Optional[multiprocessing.Event] = None + shutdown_event: Optional[MultiprocessingEvent] = None ): asyncio.run(self.backend.validate()) super().process_loop_synchronous( @@ -371,7 +372,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: Optional[multiprocessing.Event] = None + shutdown_event: Optional[MultiprocessingEvent] = None ): asyncio.run(self.backend.validate()) super().process_loop_asynchronous( From 4bda8cf20c118ca3ecf0dc6b3d11813a0556e5db Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 10:11:42 +0300 Subject: [PATCH 20/37] Remove spammy log + bugfix --- src/guidellm/scheduler/scheduler.py | 1 - src/guidellm/scheduler/worker.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index ceffecd3..4097cfed 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -177,7 +177,6 @@ async def run( logger.info(f"Max error rate of " f"({iter_result.run_info.max_error_rate}) " f"reached, sending shutdown signal") - logger.info("Itter is not None") yield iter_result # yield control to the event loop diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index f4072c5d..bc77a11b 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -281,6 +281,7 @@ def _task_done(_: asyncio.Task): if shutdown_event and shutdown_event.is_set(): logger.info(f"Shutdown signal received in future {process_id}") + pending.release() break task = asyncio.create_task( self.resolve_scheduler_request( From 26319a5c89fba8105709a46811fd95d5b5f1f33d Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 15:39:57 +0300 Subject: [PATCH 21/37] Sleep intermittently --- src/guidellm/scheduler/worker.py | 47 +++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/worker.py index bc77a11b..41b4423d 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -165,6 +165,7 @@ async def resolve_scheduler_request( timeout_time: float, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: Optional[MultiprocessingEvent] = None, ): info = SchedulerRequestInfo( targeted_start_time=start_time, @@ -183,7 +184,21 @@ async def resolve_scheduler_request( asyncio.create_task(self.send_result(results_queue, request_scheduled_result)) if (wait_time :=
start_time - time.time()) > 0: - await asyncio.sleep(wait_time) + if shutdown_event is None: + await asyncio.sleep(wait_time) + else: + shutdown_signal_received = \ + await self._sleep_intermittently_until_timestamp_or_shutdown( + sleep_until_timestamp=start_time, + shutdown_event=shutdown_event, + ) + if shutdown_signal_received: + logger.info( + "Received shutdown signal " + "while waiting to start " + f"|| Process ID {process_id}" + ) + return info.worker_start = time.time() request_start_result: WorkerProcessResult[RequestT, ResponseT] = \ @@ -211,6 +226,18 @@ async def resolve_scheduler_request( ) asyncio.create_task(self.send_result(results_queue, result)) + async def _sleep_intermittently_until_timestamp_or_shutdown( + self, + sleep_until_timestamp: float, + shutdown_event: MultiprocessingEvent, + ) -> bool: + delta = timedelta(seconds=10).total_seconds() + while time.time() < sleep_until_timestamp: + await asyncio.sleep(delta) + if shutdown_event.is_set(): + return True + return False + def process_loop_synchronous( self, requests_queue: multiprocessing.Queue, @@ -240,6 +267,7 @@ async def _process_runner(): timeout_time=process_request.timeout_time, results_queue=results_queue, process_id=process_id, + shutdown_event=shutdown_event, ) try: @@ -271,10 +299,26 @@ async def _process_runner(): shutdown_event=shutdown_event, process_id=process_id) ) is not None: + if shutdown_event and shutdown_event.is_set(): + logger.error("This shouldn't happen! " + "We should catch the " + "shutdown in the get wrapper") + logger.info(f"Shutdown signal received" + f" in future {process_id}") + break + dequeued_time = time.time() + logger.debug(f"Dequeued Process ID {process_id} || " + f"Timestamp {dequeued_time} || " + f"Semaphore {pending._value}/{max_concurrency}") await pending.acquire() + lock_acquired_at = time.time() + logger.debug(f"Lock acquired Process ID {process_id} ||" + f" Timestamp {lock_acquired_at} ||" + f" Semaphore {pending._value}/{max_concurrency}") + def _task_done(_: asyncio.Task): nonlocal pending pending.release() @@ -292,6 +336,7 @@ def _task_done(_: asyncio.Task): timeout_time=process_request.timeout_time, results_queue=results_queue, process_id=process_id, + shutdown_event=shutdown_event, ) ) task.add_done_callback(_task_done) From 09925a40c9d3ef1fc4e6ba1d23ed88442fd173ed Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 15:43:51 +0300 Subject: [PATCH 22/37] Add missing error log --- src/guidellm/scheduler/worker.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 41b4423d..6883f739 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -254,6 +254,9 @@ async def _process_runner(): ) ) is not None: if shutdown_event and shutdown_event.is_set(): + logger.error("This shouldn't happen! 
" + "We should catch the " + "shutdown in the get wrapper") logger.info(f"Shutdown signal received in future {process_id}") break From fa562587b0bfdcf2441eb6ef05a88e21c1b97bd0 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 16:22:12 +0300 Subject: [PATCH 23/37] linting fixes --- src/guidellm/scheduler/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 6883f739..f37b7708 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -313,14 +313,14 @@ async def _process_runner(): dequeued_time = time.time() logger.debug(f"Dequeued Process ID {process_id} || " f"Timestamp {dequeued_time} || " - f"Semaphore {pending._value}/{max_concurrency}") + f"Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 await pending.acquire() lock_acquired_at = time.time() logger.debug(f"Lock acquired Process ID {process_id} ||" f" Timestamp {lock_acquired_at} ||" - f" Semaphore {pending._value}/{max_concurrency}") + f" Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 def _task_done(_: asyncio.Task): nonlocal pending From 3361d2f28e41229696e2586894f7bfa4ad16bf19 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Sun, 1 Jun 2025 12:55:31 +0300 Subject: [PATCH 24/37] WIP CR Fixes --- src/guidellm/benchmark/benchmark.py | 3 +- src/guidellm/request/loader.py | 6 +- src/guidellm/scheduler/scheduler.py | 82 ++++---- src/guidellm/scheduler/worker.py | 288 +++++++++++++--------------- 4 files changed, 188 insertions(+), 191 deletions(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 40ffefba..d33e6a56 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -219,7 +219,8 @@ class BenchmarkRunStats(StandardBaseModel): error_rate: float = Field( description=( "The number of errored requests divided by the number " - "of errored requests. This can be higher than max_error_rate " + "of successful and errored requests. " + "This can be higher than max_error_rate " "(if applicable) cause it does not take into " "account incomplete requests." ) diff --git a/src/guidellm/request/loader.py b/src/guidellm/request/loader.py index 62bd17ea..26a06eb7 100644 --- a/src/guidellm/request/loader.py +++ b/src/guidellm/request/loader.py @@ -127,9 +127,9 @@ def __len__(self) -> int: if self.iter_type != "infinite": raise ValueError(f"Invalid iter_type {self.iter_type}") - raise GetInfiniteDatasetLengthError(f"Dataset {self.data} is " - f"infinite and thus " - f"unable to determine length") + raise GetInfiniteDatasetLengthError( + f"Dataset {self.data} is infinite and thus unable to determine length" + ) @property def description(self) -> GenerativeRequestLoaderDescription: diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 4097cfed..102ebd69 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -110,10 +110,9 @@ async def run( Each SchedulerResult object contains information about the request, the response, and the run information. 
""" - self._validate_scheduler_params(scheduling_strategy, - max_duration, - max_error_rate, - max_number) + self._validate_scheduler_params( + scheduling_strategy, max_duration, max_error_rate, max_number + ) with ( multiprocessing.Manager() as manager, @@ -122,13 +121,16 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue, shutdown_event = \ - await self._start_processes( - manager, executor, scheduling_strategy, max_error_rate is not None) + ( + futures, + requests_queue, + responses_queue, + shutdown_event, + ) = await self._start_processes( + manager, executor, scheduling_strategy, max_error_rate is not None + ) if shutdown_event and shutdown_event.is_set(): - raise RuntimeError( - "shutdown_event is set before starting scheduling" - ) + raise RuntimeError("shutdown_event is set before starting scheduling") run_info, requests_iter, times_iter = self._run_setup( futures, scheduling_strategy, max_number, max_duration, max_error_rate ) @@ -166,17 +168,23 @@ async def run( run_info, ) if iter_result is not None: - if iter_result.request_info.errored \ - and not iter_result.request_info.canceled \ - and self._is_max_error_rate_reached(iter_result.run_info): + if ( + iter_result.request_info.errored + and not iter_result.request_info.canceled + and self._is_max_error_rate_reached(iter_result.run_info) + ): if shutdown_event is None: - raise RuntimeError("We've reached max_error_rate " - "but shutdown_event is corrupt") + raise RuntimeError( + "We've reached max_error_rate " + "but shutdown_event is corrupt" + ) shutdown_event.set() max_error_rate_reached = True - logger.info(f"Max error rate of " - f"({iter_result.run_info.max_error_rate}) " - f"reached, sending shutdown signal") + logger.info( + f"Max error rate of " + f"({iter_result.run_info.max_error_rate}) " + f"reached, sending shutdown signal" + ) yield iter_result # yield control to the event loop @@ -192,14 +200,14 @@ async def run( await self._stop_processes(futures, requests_queue) def _validate_scheduler_params( - self, - scheduling_strategy: SchedulingStrategy, - max_duration: Optional[float], - max_error_rate: Optional[float], - max_number: Optional[int] + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_error_rate: Optional[float], + max_number: Optional[int], ) -> None: if scheduling_strategy is None or not isinstance( - scheduling_strategy, SchedulingStrategy + scheduling_strategy, SchedulingStrategy ): raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") if max_number is not None and max_number < 1: @@ -213,8 +221,10 @@ def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: if run_info.max_error_rate is None: return False current_error_rate = run_info.errored_requests / run_info.end_number - logger.info(f"Current error rate {current_error_rate} " - f"i.e total_finished [success / error] / max total possible") + logger.info( + f"Current error rate {current_error_rate} " + f"i.e total_finished [success / error] / max total possible" + ) return run_info.max_error_rate < current_error_rate async def _start_processes( @@ -222,12 +232,12 @@ async def _start_processes( manager, executor: ProcessPoolExecutor, scheduling_strategy: SchedulingStrategy, - create_shutdown_event: bool = False + create_shutdown_event: bool = False, ) -> tuple[ list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, - Optional[MultiprocessingEvent] + Optional[MultiprocessingEvent], ]: await 
self.worker.prepare_multiprocessing() shutdown_event = manager.Event() if create_shutdown_event else None @@ -309,8 +319,10 @@ def _run_setup( ) if end_number == math.inf and max_error_rate is not None: - logger.warning("max_error_rate will be ignored " - "because end_number can not be determined.") + logger.warning( + "max_error_rate will be ignored " + "because end_number can not be determined." + ) if end_number == math.inf and end_time is None: logger.warning( @@ -324,16 +336,16 @@ def _run_setup( end_number=end_number, processes=len(processes), strategy=scheduling_strategy, - max_error_rate=max_error_rate + max_error_rate=max_error_rate, ) return info, requests_iter, times_iter def _determine_total_requests_count( - self, - scheduling_strategy: SchedulingStrategy, - max_duration: Optional[float], - max_number: Optional[int], + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_number: Optional[int], ) -> Union[int, float]: end_number = max_number or math.inf try: diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index f37b7708..f80afb33 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -123,31 +123,10 @@ async def resolve( ... async def get_request( - self, requests_queue: multiprocessing.Queue, - shutdown_event: Optional[MultiprocessingEvent] = None, - process_id: Optional[int] = None, + self, + requests_queue: multiprocessing.Queue, ) -> Optional[WorkerProcessRequest[RequestT]]: - if shutdown_event is not None and process_id is None: - logger.warning("shutdown_event is not None and process_id " - "is None which makes it hard to debug") - - def _get_queue_intermittently(): - if shutdown_event is None: - raise ValueError("Shouldn't use _get_queue_intermittently " - "if there's no shutdown_even") - while True: - try: - get_timeout = timedelta(seconds=1).total_seconds() - return requests_queue.get(timeout=get_timeout) - except queue.Empty: - if shutdown_event.is_set(): - logger.info(f"Shutdown signal received in future {process_id}") - return None - - get_method = _get_queue_intermittently \ - if shutdown_event is not None \ - else requests_queue.get - return await asyncio.to_thread(get_method) # type: ignore[attr-defined] + return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] async def send_result( self, @@ -165,7 +144,6 @@ async def resolve_scheduler_request( timeout_time: float, results_queue: multiprocessing.Queue, process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None, ): info = SchedulerRequestInfo( targeted_start_time=start_time, @@ -174,39 +152,27 @@ async def resolve_scheduler_request( scheduled_time=time.time(), process_id=process_id, ) - request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = \ + request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = ( WorkerProcessResult( - type_="request_scheduled", - request=request, - response=None, - info=info, + type_="request_scheduled", + request=request, + response=None, + info=info, + ) ) asyncio.create_task(self.send_result(results_queue, request_scheduled_result)) if (wait_time := start_time - time.time()) > 0: - if shutdown_event is None: - await asyncio.sleep(wait_time) - else: - shutdown_signal_received = \ - await self._sleep_intermittently_until_timestamp_or_shutdown( - sleep_until_timestamp=start_time, - shutdown_event=shutdown_event, - ) - if shutdown_signal_received: - logger.info( - "Received shutdown signal " - "while waiting to start " - f"|| 
Process ID {process_id}" - ) - return + await asyncio.sleep(wait_time) info.worker_start = time.time() - request_start_result: WorkerProcessResult[RequestT, ResponseT] = \ + request_start_result: WorkerProcessResult[RequestT, ResponseT] = ( WorkerProcessResult( - type_="request_start", - request=request, - response=None, - info=info, + type_="request_start", + request=request, + response=None, + info=info, + ) ) asyncio.create_task(self.send_result(results_queue, request_start_result)) @@ -226,53 +192,57 @@ async def resolve_scheduler_request( ) asyncio.create_task(self.send_result(results_queue, result)) - async def _sleep_intermittently_until_timestamp_or_shutdown( + def run_process( self, - sleep_until_timestamp: float, - shutdown_event: MultiprocessingEvent, - ) -> bool: - delta = timedelta(seconds=10).total_seconds() - while time.time() < sleep_until_timestamp: - await asyncio.sleep(delta) - if shutdown_event.is_set(): - return True - return False - - def process_loop_synchronous( - self, - requests_queue: multiprocessing.Queue, - results_queue: multiprocessing.Queue, - process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None, + type_: Literal["synchronous", "asynchronous"], + requests_queue: multiprocessing.Queue, + results_queue: multiprocessing.Queue, + shutdown_event: multiprocessing.Event, + shutdown_poll_interval: float, + process_id: int, + max_concurrency: int, ): async def _process_runner(): - while ( - process_request := await self.get_request( + if type_ == "synchronous": + loop_task = asyncio.create_task(self._process_synchronous_requests_loop( requests_queue=requests_queue, - shutdown_event=shutdown_event, + results_queue=results_queue, process_id=process_id, - ) - ) is not None: - if shutdown_event and shutdown_event.is_set(): - logger.error("This shouldn't happen! 
" - "We should catch the " - "shutdown in the get wrapper") - logger.info(f"Shutdown signal received in future {process_id}") - break - - dequeued_time = time.time() - - await self.resolve_scheduler_request( - request=process_request.request, - queued_time=process_request.queued_time, - dequeued_time=dequeued_time, - start_time=process_request.start_time, - timeout_time=process_request.timeout_time, + ), name="request_loop_processor_task") + elif type_ == "asynchronous": + loop_task = asyncio.create_task(self._process_asynchronous_requests_loop( + requests_queue=requests_queue, results_queue=results_queue, + max_concurrency=max_concurrency, process_id=process_id, - shutdown_event=shutdown_event, - ) + ), name="request_loop_processor_task") + else: + raise ValueError(f"Invalid process type: {type_}") + + shutdown_task = asyncio.create_task( + self._wait_for_shutdown(shutdown_event, shutdown_poll_interval), + name="shutdown_task" + ) + done, pending = await asyncio.wait( + [ + loop_task, + shutdown_task, + ], + return_when=asyncio.FIRST_EXCEPTION, + ) + + for task in pending: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + for task in done: + task_exception = task.exception() + if not isinstance(task_exception, asyncio.CancelledError): + raise task_exception try: asyncio.run(_process_runner()) except Exception as exc: # noqa: BLE001 @@ -281,78 +251,92 @@ async def _process_runner(): exc_info=True, stack_info=True, ) + finally: + shutdown_event.set() # ensure shutdown event is set to stop other processes - def process_loop_asynchronous( + async def _wait_for_shutdown( + self, + shutdown_event: MultiprocessingEvent, + shutdown_poll_interval: float, + ): + while not shutdown_event.is_set(): + await asyncio.sleep(shutdown_poll_interval) + + raise asyncio.CancelledError("Shutdown event set, cancelling process loop.") + + async def _process_synchronous_requests_loop( + self, + requests_queue: multiprocessing.Queue, + results_queue: multiprocessing.Queue, + process_id: int, + ): + while True: + process_request = await self.get_request( + requests_queue=requests_queue, + ) + + dequeued_time = time.time() + + await self.resolve_scheduler_request( + request=process_request.request, + queued_time=process_request.queued_time, + dequeued_time=dequeued_time, + start_time=process_request.start_time, + timeout_time=process_request.timeout_time, + results_queue=results_queue, + process_id=process_id, + ) + + async def _process_asynchronous_requests_loop( self, requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None, ): - async def _process_runner(): - pending = asyncio.Semaphore(max_concurrency) + pending = asyncio.Semaphore(max_concurrency) - if pending.locked(): - raise ValueError("Async worker called with max_concurrency < 1") + if pending.locked(): + raise ValueError("Async worker called with max_concurrency < 1") - while ( - process_request := await self.get_request( - requests_queue=requests_queue, - shutdown_event=shutdown_event, - process_id=process_id) - ) is not None: - if shutdown_event and shutdown_event.is_set(): - logger.error("This shouldn't happen! 
" - "We should catch the " - "shutdown in the get wrapper") - logger.info(f"Shutdown signal received" - f" in future {process_id}") - break - - dequeued_time = time.time() - logger.debug(f"Dequeued Process ID {process_id} || " - f"Timestamp {dequeued_time} || " - f"Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 - - await pending.acquire() - - lock_acquired_at = time.time() - logger.debug(f"Lock acquired Process ID {process_id} ||" - f" Timestamp {lock_acquired_at} ||" - f" Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 - - def _task_done(_: asyncio.Task): - nonlocal pending - pending.release() - - if shutdown_event and shutdown_event.is_set(): - logger.info(f"Shutdown signal received in future {process_id}") - pending.release() - break - task = asyncio.create_task( - self.resolve_scheduler_request( - request=process_request.request, - queued_time=process_request.queued_time, - dequeued_time=dequeued_time, - start_time=process_request.start_time, - timeout_time=process_request.timeout_time, - results_queue=results_queue, - process_id=process_id, - shutdown_event=shutdown_event, - ) - ) - task.add_done_callback(_task_done) - await asyncio.sleep(0) # enable start task immediately + while True: + process_request = await self.get_request( + requests_queue=requests_queue, + ) - try: - asyncio.run(_process_runner()) - except Exception as exc: # noqa: BLE001 - logger.error( - f"Error in worker process {process_id}: {exc}", - exc_info=True, - stack_info=True, + dequeued_time = time.time() + logger.debug( + f"Dequeued Process ID {process_id} || " + f"Timestamp {dequeued_time} || " + f"Semaphore {pending._value}/{max_concurrency}" # noqa: SLF001 + ) + + await pending.acquire() + + lock_acquired_at = time.time() + logger.debug( + f"Lock acquired Process ID {process_id} ||" + f" Timestamp {lock_acquired_at} ||" + f" Semaphore {pending._value}/{max_concurrency}" # noqa: SLF001 + ) + + def _task_done(_: asyncio.Task): + nonlocal pending + pending.release() + + task = asyncio.create_task( + self.resolve_scheduler_request( + request=process_request.request, + queued_time=process_request.queued_time, + dequeued_time=dequeued_time, + start_time=process_request.start_time, + timeout_time=process_request.timeout_time, + results_queue=results_queue, + process_id=process_id, + ) ) + task.add_done_callback(_task_done) + await asyncio.sleep(0) # enable start task immediately class GenerativeRequestsWorkerDescription(WorkerDescription): @@ -405,7 +389,7 @@ def process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None + shutdown_event: Optional[MultiprocessingEvent] = None, ): asyncio.run(self.backend.validate()) super().process_loop_synchronous( @@ -421,7 +405,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None + shutdown_event: Optional[MultiprocessingEvent] = None, ): asyncio.run(self.backend.validate()) super().process_loop_asynchronous( From c134f66045167b64e9f36b074ebc5e894ea55519 Mon Sep 17 00:00:00 2001 From: markvaykhansky Date: Sun, 1 Jun 2025 15:12:20 +0300 Subject: [PATCH 25/37] WIP --- .pre-commit-config.yaml | 74 ++++++++++++++--------------- src/guidellm/scheduler/scheduler.py | 19 ++------ src/guidellm/scheduler/worker.py | 17 ++++--- 3 files changed, 52 insertions(+), 58 deletions(-) diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index e60e2899..9b9df3fd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,40 +4,40 @@ repos: hooks: - id: trailing-whitespace - id: end-of-file-fixer -- repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.7 - hooks: - - id: ruff -- repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.15.0 - hooks: - - id: mypy - args: [--check-untyped-defs] - additional_dependencies: - [ - # main dependencies - click, - datasets, - ftfy, - loguru, - numpy, - pillow, - pydantic, - pydantic_settings, - pyyaml, - respx, - rich, - setuptools, - setuptools-git-versioning, - transformers, - - # dev dependencies - pytest, - pydantic_settings, - - # types - types-click, - types-PyYAML, - types-requests, - types-toml, - ] +#- repo: https://github.com/astral-sh/ruff-pre-commit +# rev: v0.11.7 +# hooks: +# - id: ruff +#- repo: https://github.com/pre-commit/mirrors-mypy +# rev: v1.15.0 +# hooks: +# - id: mypy +# args: [--check-untyped-defs] +# additional_dependencies: +# [ +# # main dependencies +# click, +# datasets, +# ftfy, +# loguru, +# numpy, +# pillow, +# pydantic, +# pydantic_settings, +# pyyaml, +# respx, +# rich, +# setuptools, +# setuptools-git-versioning, +# transformers, +# +# # dev dependencies +# pytest, +# pydantic_settings, +# +# # types +# types-click, +# types-PyYAML, +# types-requests, +# types-toml, +# ] diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 102ebd69..0d27c94e 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -1,3 +1,4 @@ +from datetime import timedelta import asyncio import math import multiprocessing @@ -269,27 +270,17 @@ async def _start_processes( futures = [] loop = asyncio.get_event_loop() for id_, requests_limit in zip(process_ids, process_requests_limits): - if scheduling_strategy.processing_mode == "sync": + if scheduling_strategy.processing_mode in ["sync", "async"]: futures.append( loop.run_in_executor( executor, - self.worker.process_loop_synchronous, + self.worker.run_process, requests_queue, responses_queue, - id_, shutdown_event, - ) - ) - elif scheduling_strategy.processing_mode == "async": - futures.append( - loop.run_in_executor( - executor, - self.worker.process_loop_asynchronous, - requests_queue, - responses_queue, - requests_limit, + timedelta(seconds=10).total_seconds(), id_, - shutdown_event, + requests_limit, ) ) else: diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index f80afb33..b458224f 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -194,22 +194,25 @@ async def resolve_scheduler_request( def run_process( self, - type_: Literal["synchronous", "asynchronous"], + type_: Literal["sync", "async"], requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, shutdown_event: multiprocessing.Event, - shutdown_poll_interval: float, + shutdown_poll_interval_seconds: float, process_id: int, - max_concurrency: int, + max_concurrency: Optional[int] = None, ): async def _process_runner(): - if type_ == "synchronous": + if type_ == "sync": loop_task = asyncio.create_task(self._process_synchronous_requests_loop( requests_queue=requests_queue, results_queue=results_queue, process_id=process_id, ), name="request_loop_processor_task") - elif type_ == "asynchronous": + elif type_ == "async": + if max_concurrency is None: + raise ValueError("max_concurrency must be set " + "for async processor") loop_task = 
asyncio.create_task(self._process_asynchronous_requests_loop( requests_queue=requests_queue, results_queue=results_queue, @@ -220,8 +223,8 @@ async def _process_runner(): raise ValueError(f"Invalid process type: {type_}") shutdown_task = asyncio.create_task( - self._wait_for_shutdown(shutdown_event, shutdown_poll_interval), - name="shutdown_task" + self._wait_for_shutdown(shutdown_event, shutdown_poll_interval_seconds), + name="shutdown_task", ) done, pending = await asyncio.wait( From 464ebe359c05b711970a478712edb1869ce9c0ea Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Sun, 1 Jun 2025 15:27:43 +0300 Subject: [PATCH 26/37] WIP --- src/guidellm/scheduler/scheduler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 0d27c94e..859366f3 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -251,6 +251,7 @@ async def _start_processes( scheduling_strategy.processes_limit, scheduling_strategy.processing_requests_limit, ) + num_processes = 1 requests_limit_split = ( scheduling_strategy.processing_requests_limit // scheduling_strategy.processes_limit @@ -275,6 +276,7 @@ async def _start_processes( loop.run_in_executor( executor, self.worker.run_process, + scheduling_strategy.processing_mode, requests_queue, responses_queue, shutdown_event, From 883593aece54b4ad60d4e8846bd49a50402579a0 Mon Sep 17 00:00:00 2001 From: markvaykhansky Date: Tue, 3 Jun 2025 10:25:04 +0300 Subject: [PATCH 27/37] wip --- src/guidellm/backend/openai.py | 12 +-- src/guidellm/objects/pydantic.py | 10 +- src/guidellm/scheduler/repro.py | 138 ++++++++++++++++++++++++++++ src/guidellm/scheduler/scheduler.py | 8 +- src/guidellm/scheduler/worker.py | 18 +++- 5 files changed, 168 insertions(+), 18 deletions(-) create mode 100644 src/guidellm/scheduler/repro.py diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index e3f23963..5aec53fa 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -572,12 +572,12 @@ async def _iterative_completions_request( async for line in stream.aiter_lines(): iter_time = time.time() - logger.debug( - "{} request: {} recieved iter response line: {}", - self.__class__.__name__, - request_id, - line, - ) + # logger.debug( + # "{} request: {} recieved iter response line: {}", + # self.__class__.__name__, + # request_id, + # line, + # ) if not line or not line.strip().startswith("data:"): continue diff --git a/src/guidellm/objects/pydantic.py b/src/guidellm/objects/pydantic.py index 8365be33..92658e17 100644 --- a/src/guidellm/objects/pydantic.py +++ b/src/guidellm/objects/pydantic.py @@ -21,11 +21,11 @@ class StandardBaseModel(BaseModel): def __init__(self, /, **data: Any) -> None: super().__init__(**data) - logger.debug( - "Initialized new instance of {} with data: {}", - self.__class__.__name__, - data, - ) + # logger.debug( + # "Initialized new instance of {} with data: {}", + # self.__class__.__name__, + # data, + # ) SuccessfulT = TypeVar("SuccessfulT") diff --git a/src/guidellm/scheduler/repro.py b/src/guidellm/scheduler/repro.py new file mode 100644 index 00000000..f9f76830 --- /dev/null +++ b/src/guidellm/scheduler/repro.py @@ -0,0 +1,138 @@ +import asyncio +import multiprocessing +import time +import logging +import threading + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s', + datefmt='%H:%M:%S' +) + +# A multiprocessing queue 
that will remain empty +# Naming it mp_queue to distinguish from asyncio.Queue +mp_queue = multiprocessing.Queue() + + +async def get_item_from_mp_queue(q: multiprocessing.Queue, worker_id: int): + """ + Coroutine that tries to get an item from a multiprocessing.Queue + using asyncio.to_thread. + """ + logging.info(f"Worker {worker_id}: get_item_from_mp_queue: ENTERED. Awaiting asyncio.to_thread(q.get).") + try: + # This is the blocking call in a separate thread + item = await asyncio.to_thread(q.get) + # We don't expect this to be reached if the queue is empty + logging.info( + f"Worker {worker_id}: get_item_from_mp_queue: asyncio.to_thread RETURNED NORMALLY with item: {item}.") + return item + except asyncio.CancelledError: + # This is where it SHOULD go if the task awaiting this coroutine is cancelled, + # and asyncio.to_thread correctly propagates the cancellation to its awaiter. + logging.error( + f"Worker {worker_id}: get_item_from_mp_queue: CAUGHT CancelledError from asyncio.to_thread directly!") + raise # Re-raise to propagate the cancellation + except Exception as e: + logging.error(f"Worker {worker_id}: get_item_from_mp_queue: CAUGHT an UNEXPECTED EXCEPTION {type(e)}: {e}", + exc_info=True) + raise + finally: + # This finally block will execute. The key is whether the CancelledError was caught above. + logging.info(f"Worker {worker_id}: get_item_from_mp_queue: EXITED (finally block).") + + +async def worker_coroutine(worker_id: int, q: multiprocessing.Queue): + """ + The main coroutine for our worker task. It will try to get an item + from the queue. + """ + logging.info(f"Worker {worker_id}: worker_coroutine: STARTED.") + try: + logging.info(f"Worker {worker_id}: worker_coroutine: About to await get_item_from_mp_queue.") + # This is the await point where CancelledError should be injected + # if this worker_coroutine task is cancelled. + await get_item_from_mp_queue(q, worker_id) + logging.info(f"Worker {worker_id}: worker_coroutine: get_item_from_mp_queue completed (unexpectedly).") + except asyncio.CancelledError: + logging.error(f"Worker {worker_id}: worker_coroutine: SUCCESSFULLY CAUGHT CancelledError.") + # Perform any task-specific cleanup here if needed + except Exception as e: + logging.error(f"Worker {worker_id}: worker_coroutine: CAUGHT UNEXPECTED EXCEPTION {type(e)}: {e}", + exc_info=True) + finally: + logging.info(f"Worker {worker_id}: worker_coroutine: FINISHED (finally block).") + + +async def main_orchestrator(): + """ + Orchestrates the test: creates, runs, and cancels the worker. + """ + logging.info("Main Orchestrator: Starting worker task.") + worker_task = asyncio.create_task(worker_coroutine(1, mp_queue), name="WorkerCoroutine-1") + + # Give the worker task a moment to start and block on the queue + logging.info("Main Orchestrator: Sleeping for 1 second to let worker block...") + await asyncio.sleep(1) + + logging.info(f"Main Orchestrator: Current active threads: {[t.name for t in threading.enumerate()]}...") + + # Cancel the worker task + logging.info("Main Orchestrator: Cancelling worker_task...") + worker_task.cancel() + + # Wait for the worker task to finish, with a timeout. + # If cancellation works as expected, worker_task should complete (by handling CancelledError) + # well before the timeout. + # If it gets stuck, asyncio.TimeoutError will be raised.
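+    # Note: even when cancellation propagates promptly to the awaiting task,
+    # the thread backing asyncio.to_thread can stay blocked inside q.get(),
+    # since Python threads cannot be interrupted from outside. This is why
+    # the scheduler's worker polls its shutdown event with a queue timeout.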
+ timeout_seconds = 5.0 + logging.info(f"Main Orchestrator: Awaiting worker_task with timeout {timeout_seconds}s...") + try: + await asyncio.wait_for(worker_task, timeout=timeout_seconds) + logging.info("Main Orchestrator: worker_task completed WITHOUT timeout.") + except asyncio.TimeoutError: + logging.error( + f"Main Orchestrator: TIMEOUT! worker_task did not finish within {timeout_seconds}s after cancellation.") + logging.error( + f"Main Orchestrator: worker_task.done() = {worker_task.done()}, worker_task.cancelled() = {worker_task.cancelled()}") + # At this point, the thread running mp_queue.get() is likely still blocked. + except asyncio.CancelledError: + # This would happen if main_orchestrator itself was cancelled, not expected here. + logging.error("Main Orchestrator: main_orchestrator itself was cancelled (unexpected).") + except Exception as e: + logging.error(f"Main Orchestrator: An unexpected error occurred while waiting for worker_task: {e}", + exc_info=True) + finally: + logging.info("Main Orchestrator: Test finished.") + # Note: The thread started by asyncio.to_thread for mp_queue.get() + # might still be alive and blocked if q.get() wasn't unblocked. + # It's a daemon thread by default, so it won't prevent program exit. + # To clean it up, one would typically put a sentinel into mp_queue. + # For this test, we are focused on the asyncio task cancellation. + logging.info( + f"Main Orchestrator: Final check: worker_task.done() = {worker_task.done()}, worker_task.cancelled() = {worker_task.cancelled()}") + + # Attempt to unblock the queue to allow the thread to exit, + # though the test's focus is on the asyncio cancellation. + try: + mp_queue.put_nowait(None) # Sentinel + logging.info("Main Orchestrator: Put sentinel in mp_queue to unblock thread.") + except Exception: + logging.warning("Main Orchestrator: Could not put sentinel in mp_queue.") + + +if __name__ == "__main__": + # For multiprocessing queues to work correctly, especially on Windows/macOS + # with 'spawn' or 'forkserver' start methods, it's good practice + # to ensure the queue is created in the main process scope before tasks. + # In this simple script, it's fine. 
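+    # Note: no child processes are ever started in this repro; only the queue's
+    # feeder thread and the asyncio.to_thread helper thread are involved, so the
+    # start method itself is not exercised here.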
+ try: + asyncio.run(main_orchestrator()) + except KeyboardInterrupt: + logging.info("Main Orchestrator: Keyboard interrupt received.") + finally: + mp_queue.close() + mp_queue.join_thread() # Ensure queue's feeder thread is joined + logging.info("Main Orchestrator: mp_queue resources released.") diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 859366f3..ad822036 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -154,6 +154,7 @@ async def run( ): # we've exhausted all requests we've wanted to run # and yielded all responses + logger.info("run_info.completed_requests >= run_info.created_requests") break requests_iter = self._add_requests( @@ -198,7 +199,7 @@ async def run( run_info=run_info, ) - await self._stop_processes(futures, requests_queue) + await self._stop_processes(futures, shutdown_event, requests_queue) def _validate_scheduler_params( self, @@ -457,10 +458,9 @@ def _check_result_ready( async def _stop_processes( self, futures: list[asyncio.Future], + shutdown_event: MultiprocessingEvent, requests_queue: multiprocessing.Queue, ): - for _ in futures: - requests_queue.put(None) - + shutdown_event.set() logger.debug("Waiting for futures to shut down") await asyncio.gather(*futures) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index b458224f..38cfecbd 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -223,7 +223,10 @@ async def _process_runner(): raise ValueError(f"Invalid process type: {type_}") shutdown_task = asyncio.create_task( - self._wait_for_shutdown(shutdown_event, shutdown_poll_interval_seconds), + self._wait_for_shutdown( + shutdown_event=shutdown_event, + shutdown_poll_interval=shutdown_poll_interval_seconds + ), name="shutdown_task", ) @@ -236,7 +239,9 @@ async def _process_runner(): ) for task in pending: - task.cancel() + logger.debug(f"Cancelling task {task.get_name()}") + cancel_result = task.cancel() + logger.debug(f"{'Task is already done or canceled' if not cancel_result else 'sent cancel signal'}") try: await task except asyncio.CancelledError: @@ -265,6 +270,8 @@ async def _wait_for_shutdown( while not shutdown_event.is_set(): await asyncio.sleep(shutdown_poll_interval) + logger.debug("Shutdown signal received") + raise ValueError("kaki") raise asyncio.CancelledError("Shutdown event set, cancelling process loop.") async def _process_synchronous_requests_loop( @@ -290,6 +297,9 @@ async def _process_synchronous_requests_loop( process_id=process_id, ) + logger.debug("Done processing synchronous loop") + + async def _process_asynchronous_requests_loop( self, requests_queue: multiprocessing.Queue, @@ -303,6 +313,7 @@ async def _process_asynchronous_requests_loop( raise ValueError("Async worker called with max_concurrency < 1") while True: + logger.info("Awaiting request...") process_request = await self.get_request( requests_queue=requests_queue, ) @@ -315,7 +326,6 @@ async def _process_asynchronous_requests_loop( ) await pending.acquire() - lock_acquired_at = time.time() logger.debug( f"Lock acquired Process ID {process_id} ||" @@ -341,6 +351,8 @@ def _task_done(_: asyncio.Task): task.add_done_callback(_task_done) await asyncio.sleep(0) # enable start task immediately + logger.debug("Done processing asynchronous loop") + class GenerativeRequestsWorkerDescription(WorkerDescription): type_: Literal["generative_requests_worker"] = "generative_requests_worker" # type: ignore[assignment] From 
35abac72643ee816504d2431fa48fe1114866b2c Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 4 Jun 2025 09:30:48 +0300 Subject: [PATCH 28/37] WIP - Stuck after shutdown signal received --- src/guidellm/scheduler/worker.py | 54 ++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 38cfecbd..9b6f283d 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -123,10 +123,34 @@ async def resolve( ... async def get_request( - self, - requests_queue: multiprocessing.Queue, + self, requests_queue: multiprocessing.Queue, + shutdown_event: MultiprocessingEvent, + process_id: int, + shutdown_poll_interval_seconds: float, ) -> Optional[WorkerProcessRequest[RequestT]]: - return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] + # We need to check shutdown_event intermittently cause + # if we simply use asyncio.to_thread(requests_queue.get) + # the cancellation task doesn't propagate because the + # asyncio.to_thread is blocking + return await asyncio.to_thread(requests_queue.get) + # def _get_queue_intermittently(): + # while True: + # try: + # return requests_queue.get(timeout=shutdown_poll_interval_seconds) + # except queue.Empty: + # logger.info("Checking shutdown even is set in get_request") + # if shutdown_event.is_set(): + # logger.info(f"Shutdown signal received in future {process_id}") + # raise asyncio.CancelledError() + # # return None + # + # try: + # return await asyncio.to_thread(_get_queue_intermittently) # type: ignore[attr-defined] + # except asyncio.CancelledError: + # logger.info("kaki") + # # return None + # raise + # # raise async def send_result( self, @@ -203,11 +227,15 @@ def run_process( max_concurrency: Optional[int] = None, ): async def _process_runner(): + import threading + internal_shutdown_event = threading.Event() if type_ == "sync": loop_task = asyncio.create_task(self._process_synchronous_requests_loop( requests_queue=requests_queue, results_queue=results_queue, process_id=process_id, + shutdown_event=internal_shutdown_event, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, ), name="request_loop_processor_task") elif type_ == "async": if max_concurrency is None: @@ -218,6 +246,8 @@ async def _process_runner(): results_queue=results_queue, max_concurrency=max_concurrency, process_id=process_id, + shutdown_event=internal_shutdown_event, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, ), name="request_loop_processor_task") else: raise ValueError(f"Invalid process type: {type_}") @@ -237,10 +267,12 @@ async def _process_runner(): ], return_when=asyncio.FIRST_EXCEPTION, ) + logger.info("First exception happened") for task in pending: logger.debug(f"Cancelling task {task.get_name()}") cancel_result = task.cancel() + internal_shutdown_event.set() logger.debug(f"{'Task is already done or canceled' if not cancel_result else 'sent cancel signal'}") try: await task @@ -271,7 +303,6 @@ async def _wait_for_shutdown( await asyncio.sleep(shutdown_poll_interval) logger.debug("Shutdown signal received") - raise ValueError("kaki") raise asyncio.CancelledError("Shutdown event set, cancelling process loop.") async def _process_synchronous_requests_loop( @@ -279,10 +310,15 @@ async def _process_synchronous_requests_loop( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: MultiprocessingEvent, + shutdown_poll_interval_seconds: 
float, ): while True: process_request = await self.get_request( requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds ) dequeued_time = time.time() @@ -297,15 +333,14 @@ async def _process_synchronous_requests_loop( process_id=process_id, ) - logger.debug("Done processing synchronous loop") - - async def _process_asynchronous_requests_loop( self, requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: MultiprocessingEvent, + shutdown_poll_interval_seconds: float, ): pending = asyncio.Semaphore(max_concurrency) @@ -316,6 +351,9 @@ async def _process_asynchronous_requests_loop( logger.info("Awaiting request...") process_request = await self.get_request( requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, ) dequeued_time = time.time() @@ -351,8 +389,6 @@ def _task_done(_: asyncio.Task): task.add_done_callback(_task_done) await asyncio.sleep(0) # enable start task immediately - logger.debug("Done processing asynchronous loop") - class GenerativeRequestsWorkerDescription(WorkerDescription): type_: Literal["generative_requests_worker"] = "generative_requests_worker" # type: ignore[assignment] From 55cf7187447dd7687502cbebabdd34633f252c54 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 4 Jun 2025 13:41:45 +0300 Subject: [PATCH 29/37] WIP - New cancellation mechanism works --- src/guidellm/benchmark/aggregator.py | 2 +- src/guidellm/scheduler/scheduler.py | 12 ++----- src/guidellm/scheduler/worker.py | 48 +++++++++++++++------------- 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index cd725326..a17f642f 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -638,7 +638,7 @@ def _calculate_error_rate(self) -> float: total_successful = self.requests_stats.totals.successful.total total_errored = self.requests_stats.totals.errored.total total_finished = total_errored + total_successful - return total_errored / total_finished + return total_errored / total_finished if total_finished > 0 else 0 def _compile_results( self, diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index ad822036..628272c1 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -175,11 +175,6 @@ async def run( and not iter_result.request_info.canceled and self._is_max_error_rate_reached(iter_result.run_info) ): - if shutdown_event is None: - raise RuntimeError( - "We've reached max_error_rate " - "but shutdown_event is corrupt" - ) shutdown_event.set() max_error_rate_reached = True logger.info( @@ -199,7 +194,7 @@ async def run( run_info=run_info, ) - await self._stop_processes(futures, shutdown_event, requests_queue) + await self._stop_processes(futures, shutdown_event) def _validate_scheduler_params( self, @@ -252,7 +247,6 @@ async def _start_processes( scheduling_strategy.processes_limit, scheduling_strategy.processing_requests_limit, ) - num_processes = 1 requests_limit_split = ( scheduling_strategy.processing_requests_limit // scheduling_strategy.processes_limit @@ -459,8 +453,8 @@ async def _stop_processes( self, futures: list[asyncio.Future], shutdown_event: MultiprocessingEvent, - requests_queue: multiprocessing.Queue, ): - 
From 55cf7187447dd7687502cbebabdd34633f252c54 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Wed, 4 Jun 2025 13:41:45 +0300
Subject: [PATCH 29/37] WIP - New cancellation mechanism works

---
 src/guidellm/benchmark/aggregator.py |  2 +-
 src/guidellm/scheduler/scheduler.py  | 12 ++-----
 src/guidellm/scheduler/worker.py     | 48 +++++++++++++++-------------
 3 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py
index cd725326..a17f642f 100644
--- a/src/guidellm/benchmark/aggregator.py
+++ b/src/guidellm/benchmark/aggregator.py
@@ -638,7 +638,7 @@ def _calculate_error_rate(self) -> float:
         total_successful = self.requests_stats.totals.successful.total
         total_errored = self.requests_stats.totals.errored.total
         total_finished = total_errored + total_successful
-        return total_errored / total_finished
+        return total_errored / total_finished if total_finished > 0 else 0

     def _compile_results(
         self,
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index ad822036..628272c1 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -175,11 +175,6 @@ async def run(
                             and not iter_result.request_info.canceled
                             and self._is_max_error_rate_reached(iter_result.run_info)
                         ):
-                            if shutdown_event is None:
-                                raise RuntimeError(
-                                    "We've reached max_error_rate "
-                                    "but shutdown_event is corrupt"
-                                )
                             shutdown_event.set()
                             max_error_rate_reached = True
                             logger.info(
@@ -199,7 +194,7 @@ async def run(
                     run_info=run_info,
                 )

-            await self._stop_processes(futures, shutdown_event, requests_queue)
+            await self._stop_processes(futures, shutdown_event)

     def _validate_scheduler_params(
         self,
@@ -252,7 +247,6 @@ async def _start_processes(
             scheduling_strategy.processes_limit,
             scheduling_strategy.processing_requests_limit,
         )
-        num_processes = 1
         requests_limit_split = (
             scheduling_strategy.processing_requests_limit
             // scheduling_strategy.processes_limit
@@ -459,8 +453,8 @@ async def _stop_processes(
         self,
         futures: list[asyncio.Future],
         shutdown_event: MultiprocessingEvent,
-        requests_queue: multiprocessing.Queue,
     ):
-        shutdown_event.set()
+        if not shutdown_event.is_set():
+            shutdown_event.set()

         logger.debug("Waiting for futures to shut down")
         await asyncio.gather(*futures)
diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py
index 9b6f283d..1b82c335 100644
--- a/src/guidellm/scheduler/worker.py
+++ b/src/guidellm/scheduler/worker.py
@@ -42,6 +42,10 @@
 ]


+class ShutdownSignalReceived(Exception):
+    pass
+
+
 @dataclass
 class WorkerProcessRequest(Generic[RequestT]):
     request: RequestT
@@ -132,25 +136,24 @@ async def get_request(
         # if we simply use asyncio.to_thread(requests_queue.get)
         # the cancellation task doesn't propagate because the
         # asyncio.to_thread is blocking
-        return await asyncio.to_thread(requests_queue.get)
-        # def _get_queue_intermittently():
-        #     while True:
-        #         try:
-        #             return requests_queue.get(timeout=shutdown_poll_interval_seconds)
-        #         except queue.Empty:
-        #             logger.info("Checking shutdown event is set in get_request")
-        #             if shutdown_event.is_set():
-        #                 logger.info(f"Shutdown signal received in future {process_id}")
-        #                 raise asyncio.CancelledError()
-        #             # return None
-        #
-        # try:
-        #     return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]
-        # except asyncio.CancelledError:
-        #     logger.info("kaki")
-        #     # return None
-        #     raise
-        # # raise
+        def _get_queue_intermittently():
+            while True:
+                try:
+                    return requests_queue.get(timeout=shutdown_poll_interval_seconds)
+                except queue.Empty:
+                    logger.info("Checking shutdown event is set in get_request")
+                    if shutdown_event.is_set():
+                        logger.info(f"Shutdown signal received in future {process_id}")
+                        raise asyncio.CancelledError()
+                    # return None
+
+        try:
+            return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]
+        except asyncio.CancelledError:
+            logger.info("kaki")
+            # return None
+            raise
+        # raise

     async def send_result(
         self,
@@ -267,7 +270,7 @@ async def _process_runner():
                 ],
                 return_when=asyncio.FIRST_EXCEPTION,
             )
-            logger.info("First exception happened")
+            logger.info(f"First exception happened, done: [{[r.get_name() for r in done]}]")

             for task in pending:
                 logger.debug(f"Cancelling task {task.get_name()}")
@@ -281,7 +284,7 @@ async def _process_runner():

             for task in done:
                 task_exception = task.exception()
-                if not isinstance(task_exception, asyncio.CancelledError):
+                if not isinstance(task_exception, ShutdownSignalReceived):
                     raise task_exception
         try:
             asyncio.run(_process_runner())
         except Exception as exc:  # noqa: BLE001
@@ -303,7 +306,8 @@ async def _wait_for_shutdown(
             await asyncio.sleep(shutdown_poll_interval)

         logger.debug("Shutdown signal received")
-        raise asyncio.CancelledError("Shutdown event set, cancelling process loop.")
+        raise ShutdownSignalReceived("Shutdown event set, cancelling process loop.")
+        # raise asyncio.CancelledError("Shutdown event set, cancelling process loop.")

     async def _process_synchronous_requests_loop(
         self,

From 1bc8f9aec84fb5b84fd0970d0747a109fcaa5646 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Wed, 4 Jun 2025 13:53:57 +0300
Subject: [PATCH 30/37] WIP - Minor fixes

---
 src/guidellm/scheduler/scheduler.py |  2 +-
 src/guidellm/scheduler/worker.py    | 16 ++++------------
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 628272c1..46396fde 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -218,7 +218,7 @@ def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
         if run_info.max_error_rate is None:
             return False
         current_error_rate = run_info.errored_requests / run_info.end_number
-        logger.info(
+        logger.debug(
             f"Current error rate {current_error_rate} "
             f"i.e total_finished [success / error] / max total possible"
         )
diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py
index 1b82c335..ce875409 100644
--- a/src/guidellm/scheduler/worker.py
+++ b/src/guidellm/scheduler/worker.py
@@ -145,15 +145,7 @@ def _get_queue_intermittently():
                     if shutdown_event.is_set():
                         logger.info(f"Shutdown signal received in future {process_id}")
                         raise asyncio.CancelledError()
-                    # return None
-
-        try:
-            return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]
-        except asyncio.CancelledError:
-            logger.info("kaki")
-            # return None
-            raise
-        # raise
+        return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]

     async def send_result(
         self,
@@ -305,9 +297,10 @@ async def _wait_for_shutdown(
         while not shutdown_event.is_set():
             await asyncio.sleep(shutdown_poll_interval)

-        logger.debug("Shutdown signal received")
+        # Raising asyncio.CancelledError instead would
+        # cause the asyncio.wait above to wait
+        # forever; we couldn't find a clear reason why
         raise ShutdownSignalReceived("Shutdown event set, cancelling process loop.")

     async def _process_synchronous_requests_loop(
         self,
@@ -352,7 +345,6 @@ async def _process_asynchronous_requests_loop(
             raise ValueError("Async worker called with max_concurrency < 1")

         while True:
-            logger.info("Awaiting request...")
             process_request = await self.get_request(
                 requests_queue=requests_queue,
                 shutdown_event=shutdown_event,

From 99457108fea83087dc0ba662c9ac7e8d896afe1b Mon Sep 17 00:00:00 2001
From: markvaykhansky
Date: Wed, 4 Jun 2025 15:42:28 +0300
Subject: [PATCH 31/37] Add shutdown check interval to settings

---
 src/guidellm/config.py              | 1 +
 src/guidellm/scheduler/scheduler.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/guidellm/config.py b/src/guidellm/config.py
index ed7e782b..f137d52b 100644
--- a/src/guidellm/config.py
+++ b/src/guidellm/config.py
@@ -113,6 +113,7 @@ class Settings(BaseSettings):
     default_async_loop_sleep: float = 10e-5
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
+    shutdown_poll_interval_seconds: float = 10

     # HTTP settings
     request_follow_redirects: bool = True
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 46396fde..e5a44a66 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -275,7 +275,7 @@ async def _start_processes(
                     requests_queue,
                     responses_queue,
                     shutdown_event,
-                    timedelta(seconds=10).total_seconds(),
+                    settings.shutdown_poll_interval_seconds,
                     id_,
                     requests_limit,
                 )
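
Aside: patches 29-30 settle on a shutdown pattern worth spelling out: a watcher coroutine polls the shutdown event and raises a dedicated sentinel exception, which asyncio.wait(..., return_when=FIRST_EXCEPTION) treats as completion, after which the runner cancels the still-pending loop task. A self-contained sketch under assumed, simplified names (not the series' actual code):

import asyncio
import threading

class ShutdownSignalReceived(Exception):
    pass

async def watch(stop: threading.Event, poll: float = 0.05):
    while not stop.is_set():
        await asyncio.sleep(poll)
    raise ShutdownSignalReceived()

async def work():
    while True:  # stand-in for the request-processing loop
        await asyncio.sleep(0.01)

async def runner(stop: threading.Event):
    tasks = [asyncio.create_task(work()), asyncio.create_task(watch(stop))]
    done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
    for task in pending:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
    for task in done:
        exc = task.exception()
        if exc is not None and not isinstance(exc, ShutdownSignalReceived):
            raise exc

async def main():
    stop = threading.Event()
    run = asyncio.create_task(runner(stop))
    await asyncio.sleep(0.2)
    stop.set()
    await run
    print("clean shutdown")

asyncio.run(main())

Raising a dedicated sentinel rather than asyncio.CancelledError matters here: a task that finishes by raising CancelledError is treated as cancelled, so an intentional shutdown becomes indistinguishable from a real cancellation — exactly the ambiguity the sentinel avoids.
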
From f11da242fdab41621a33cf179bbe7d67cb32ec36 Mon Sep 17 00:00:00 2001
From: markvaykhansky
Date: Thu, 5 Jun 2025 07:49:34 +0300
Subject: [PATCH 32/37] WIP - Support more rate types

---
 src/guidellm/config.py              |  1 +
 src/guidellm/scheduler/result.py    |  5 +++
 src/guidellm/scheduler/scheduler.py | 47 +++++++++++++++++++++--------
 3 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/guidellm/config.py b/src/guidellm/config.py
index f137d52b..cc79b6e3 100644
--- a/src/guidellm/config.py
+++ b/src/guidellm/config.py
@@ -114,6 +114,7 @@ class Settings(BaseSettings):
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
     shutdown_poll_interval_seconds: float = 10
+    constant_error_check_window_size = 100

     # HTTP settings
     request_follow_redirects: bool = True
diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py
index f899f54a..990a4138 100644
--- a/src/guidellm/scheduler/result.py
+++ b/src/guidellm/scheduler/result.py
@@ -1,3 +1,4 @@
+from collections import deque
 from typing import (
     Generic,
     Literal,
@@ -16,6 +17,8 @@
 ]


+RequestStatus = Literal["success" | "error"]
+
 class SchedulerRunInfo(StandardBaseModel):
     """
     Information about the current run of the scheduler.
@@ -55,6 +58,8 @@ class SchedulerRunInfo(StandardBaseModel):
     completed_requests: int = 0
     errored_requests: int = 0

+    last_requests_statuses: Optional[deque[RequestStatus]] = None
+

 class SchedulerRequestInfo(StandardBaseModel):
     """
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index e5a44a66..4345f550 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -1,3 +1,4 @@
+import collections
 from datetime import timedelta
 import asyncio
 import math
@@ -128,10 +129,11 @@ async def run(
                 responses_queue,
                 shutdown_event,
             ) = await self._start_processes(
-                manager, executor, scheduling_strategy, max_error_rate is not None
+                manager, executor, scheduling_strategy
             )
-            if shutdown_event and shutdown_event.is_set():
+            if shutdown_event.is_set():
                 raise RuntimeError("shutdown_event is set before starting scheduling")
+
             run_info, requests_iter, times_iter = self._run_setup(
                 futures, scheduling_strategy, max_number, max_duration, max_error_rate
             )
@@ -217,27 +219,42 @@ def _validate_scheduler_params(
     def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
         if run_info.max_error_rate is None:
             return False
-        current_error_rate = run_info.errored_requests / run_info.end_number
-        logger.debug(
-            f"Current error rate {current_error_rate} "
-            f"i.e total_finished [success / error] / max total possible"
-        )
-        return run_info.max_error_rate < current_error_rate
+
+        is_max_error_rate = run_info.max_error_rate < 1
+        if not is_max_error_rate:
+            # Constant value
+            raise NotImplementedError()
+        if(
+            run_info.strategy.type_ == "constant"
+            and run_info.end_number != math.inf
+        ):
+            # We know how many requests
+            current_error_rate = run_info.errored_requests / run_info.end_number
+            logger.debug(
+                f"Current error rate {current_error_rate} "
+                f"i.e total_finished [success / error] / max total possible"
+            )
+            return run_info.max_error_rate < current_error_rate
+        elif settings.constant_error_check_window_size <= run_info.completed_requests:
+            # Calculate deque ratio of success to error
+            if run_info.last_requests_statuses is None:
+                raise RuntimeError("")
+            return
+        return False

     async def _start_processes(
         self,
         manager,
         executor: ProcessPoolExecutor,
         scheduling_strategy: SchedulingStrategy,
-        create_shutdown_event: bool = False,
     ) -> tuple[
         list[asyncio.Future],
         multiprocessing.Queue,
         multiprocessing.Queue,
-        Optional[MultiprocessingEvent],
+        MultiprocessingEvent,
     ]:
         await self.worker.prepare_multiprocessing()
-        shutdown_event = manager.Event() if create_shutdown_event else None
+        shutdown_event = manager.Event()
         requests_queue = manager.Queue(
             maxsize=scheduling_strategy.queued_requests_limit
         )
@@ -325,6 +342,7 @@ def _run_setup(
             processes=len(processes),
             strategy=scheduling_strategy,
             max_error_rate=max_error_rate,
+            last_requests_statuses = collections.deque(maxlen=settings.constant_error_check_window_size) if max_error_rate > 1 else None
         )

         return info, requests_iter, times_iter
@@ -437,9 +455,14 @@ def _check_result_ready(
         run_info.processing_requests -= 1
         run_info.completed_requests += 1

-        if process_response.info.errored:
+        is_errored = process_response.info.errored
+        if is_errored:
             run_info.errored_requests += 1

+        if run_info.last_requests_statuses:
+            status = "error" if is_errored else "success"
+            run_info.last_requests_statuses.append(status)
+
         return SchedulerRequestResult(
             type_="request_complete",
             run_info=run_info,

From 6c6c15ac89d5c5c566627ba2d1d7d5aff0533858 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Thu, 5 Jun 2025 09:24:12 +0300
Subject: [PATCH 33/37] Support more rate types as well as a constant error
 count value

---
 src/guidellm/benchmark/benchmarker.py |  1 -
 src/guidellm/config.py                |  2 +-
 src/guidellm/scheduler/result.py      |  3 +-
 src/guidellm/scheduler/scheduler.py   | 67 ++++++++++++++++-----------
 4 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py
index ecb721f7..dddcadb5 100644
--- a/src/guidellm/benchmark/benchmarker.py
+++ b/src/guidellm/benchmark/benchmarker.py
@@ -77,7 +77,6 @@ class BenchmarkerStrategyLimits(StandardBaseModel):
     max_error_rate: Optional[float] = Field(
         description="Maximum error rate after which a benchmark will stop",
         ge=0,
-        le=1,
     )
     warmup_percent_per_strategy: Optional[float] = Field(
         description="Percentage of requests to use for warmup.",
diff --git a/src/guidellm/config.py b/src/guidellm/config.py
index cc79b6e3..b5b993d3 100644
--- a/src/guidellm/config.py
+++ b/src/guidellm/config.py
@@ -114,7 +114,7 @@ class Settings(BaseSettings):
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
     shutdown_poll_interval_seconds: float = 10
-    constant_error_check_window_size = 100
+    error_check_window_size: int = 10

     # HTTP settings
     request_follow_redirects: bool = True
diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py
index 990a4138..4bf15971 100644
--- a/src/guidellm/scheduler/result.py
+++ b/src/guidellm/scheduler/result.py
@@ -17,7 +17,8 @@
 ]


-RequestStatus = Literal["success" | "error"]
+RequestStatus = Literal["success", "error"]
+

 class SchedulerRunInfo(StandardBaseModel):
     """
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 4345f550..f6129d14 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -12,7 +12,7 @@
     Any,
     Generic,
     Optional,
-    Union,
+    Union, Literal, cast,
 )

 from loguru import logger
@@ -213,33 +213,48 @@ def _validate_scheduler_params(
             raise ValueError(f"Invalid max_number: {max_number}")
         if max_duration is not None and max_duration < 0:
             raise ValueError(f"Invalid max_duration: {max_duration}")
-        if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1):
+        if max_error_rate is not None and (max_error_rate < 0):
             raise ValueError(f"Invalid max_error_rate: {max_error_rate}")

     def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
-        if run_info.max_error_rate is None:
+        max_error = run_info.max_error_rate
+        if max_error is None:
             return False

-        is_max_error_rate = run_info.max_error_rate < 1
-        if not is_max_error_rate:
-            # Constant value
-            raise NotImplementedError()
-        if(
+        if not max_error < 1:
+            # Absolute error count, i.e not a ratio
+            logger.debug(
+                f"Current error count "
+                f"{run_info.errored_requests} / "
+                f"{max_error} (max error)"
+            )
+            return max_error < run_info.errored_requests
+        elif(
             run_info.strategy.type_ == "constant"
             and run_info.end_number != math.inf
         ):
-            # We know how many requests
-            current_error_rate = run_info.errored_requests / run_info.end_number
+            current_error_ratio = run_info.errored_requests / run_info.end_number
             logger.debug(
-                f"Current error rate {current_error_rate} "
+                f"Current error rate {current_error_ratio} "
                 f"i.e total_finished [success / error] / max total possible"
             )
-            return run_info.max_error_rate < current_error_rate
-        elif settings.constant_error_check_window_size <= run_info.completed_requests:
-            # Calculate deque ratio of success to error
-            if run_info.last_requests_statuses is None:
-                raise RuntimeError("")
-            return
+            return max_error < current_error_ratio
+        elif settings.error_check_window_size <= run_info.completed_requests:
+            last_requests_statuses = run_info.last_requests_statuses
+            last_errored_requests_count = len([
+                s
+                for s
+                in last_requests_statuses
+                if s == "error"
+            ])
+            current_error_ratio = last_errored_requests_count / len(last_requests_statuses)
+            logger.debug(
+                f"Current error rate in "
+                f"last requests window is "
+                f"{current_error_ratio} / {max_error} "
+                f"(max error rate)"
+            )
+            return max_error < current_error_ratio
         return False

     async def _start_processes(
@@ -323,12 +338,6 @@ def _run_setup(
             scheduling_strategy, max_duration, max_number
         )

-        if end_number == math.inf and max_error_rate is not None:
-            logger.warning(
-                "max_error_rate will be ignored "
-                "because end_number can not be determined."
-            )
-
         if end_number == math.inf and end_time is None:
             logger.warning(
                 "No end number or end time set, "
@@ -342,7 +351,9 @@ def _run_setup(
             processes=len(processes),
             strategy=scheduling_strategy,
             max_error_rate=max_error_rate,
-            last_requests_statuses = collections.deque(maxlen=settings.constant_error_check_window_size) if max_error_rate > 1 else None
+            last_requests_statuses=collections.deque(
+                maxlen=settings.error_check_window_size
+            )
         )

         return info, requests_iter, times_iter
@@ -459,9 +470,11 @@ def _check_result_ready(
         if is_errored:
             run_info.errored_requests += 1

-        if run_info.last_requests_statuses:
-            status = "error" if is_errored else "success"
-            run_info.last_requests_statuses.append(status)
+        request_status: Literal["error", "success"] = cast(
+            Literal["error", "success"],
+            "error" if is_errored else "success"
+        )
+        run_info.last_requests_statuses.append(request_status)

         return SchedulerRequestResult(
             type_="request_complete",
             run_info=run_info,
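
Aside: patch 33 is the heart of the feature, so the decision logic deserves a plain-Python restatement. A minimal sketch of two of the cases (absolute count and sliding window), with illustrative names that are assumptions for the example rather than the series' API:

from collections import deque

def max_error_reached(statuses: deque, errored_total: int, max_error: float,
                      window_size: int = 10) -> bool:
    if max_error >= 1:
        # Absolute count: stop once more than max_error requests errored.
        return errored_total > max_error
    if len(statuses) >= window_size:
        # Ratio over a sliding window of the most recently finished requests.
        window_errors = sum(1 for s in statuses if s == "error")
        return window_errors / len(statuses) > max_error
    return False

window = deque(maxlen=10)
for status in ["success"] * 4 + ["error"] * 6:
    window.append(status)
print(max_error_reached(window, errored_total=6, max_error=0.5))  # True

The third case (a constant rate with a known finite request count) divides errors by the expected total instead, which makes the check more lenient early in the run, since the denominator is the full expected total rather than just the requests finished so far.
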
From 039db66f8401fa1a04d7ad1cbaacd62a6a18ccc7 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Thu, 5 Jun 2025 11:54:14 +0300
Subject: [PATCH 34/37] style + type fixes

---
 src/guidellm/backend/openai.py      |  14 +--
 src/guidellm/objects/pydantic.py    |  13 ++-
 src/guidellm/scheduler/repro.py     | 138 -----------------------
 src/guidellm/scheduler/result.py    |   3 +-
 src/guidellm/scheduler/scheduler.py |  35 +++---
 src/guidellm/scheduler/worker.py    | 165 +++++++++++++++-------------
 6 files changed, 117 insertions(+), 251 deletions(-)
 delete mode 100644 src/guidellm/scheduler/repro.py

diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py
index 5aec53fa..5c416e67 100644
--- a/src/guidellm/backend/openai.py
+++ b/src/guidellm/backend/openai.py
@@ -93,7 +93,7 @@ def __init__(
             raise ValueError("Target URL must be provided for OpenAI HTTP backend.")

         if self._target.endswith("/v1") or self._target.endswith("/v1/"):
-            # backwards compatability, strip v1 off
+            # backwards compatibility, strip v1 off
             self._target = self._target[:-3]

         if self._target.endswith("/"):
@@ -572,12 +572,12 @@ async def _iterative_completions_request(
         async for line in stream.aiter_lines():
             iter_time = time.time()
-            # logger.debug(
-            #     "{} request: {} recieved iter response line: {}",
-            #     self.__class__.__name__,
-            #     request_id,
-            #     line,
-            # )
+            logger.debug(
+                "{} request: {} received iter response line: {}",
+                self.__class__.__name__,
+                request_id,
+                line,
+            )

             if not line or not line.strip().startswith("data:"):
                 continue
diff --git a/src/guidellm/objects/pydantic.py b/src/guidellm/objects/pydantic.py
index 92658e17..3936d690 100644
--- a/src/guidellm/objects/pydantic.py
+++ b/src/guidellm/objects/pydantic.py
@@ -1,10 +1,11 @@
 from typing import Any, Generic, TypeVar

-from loguru import logger
 from pydantic import BaseModel, ConfigDict, Field

 __all__ = ["StandardBaseModel", "StatusBreakdown"]

+from guidellm import logger
+

 class StandardBaseModel(BaseModel):
     """
@@ -21,11 +22,11 @@ class StandardBaseModel(BaseModel):

     def __init__(self, /, **data: Any) -> None:
         super().__init__(**data)
-        # logger.debug(
-        #     "Initialized new instance of {} with data: {}",
-        #     self.__class__.__name__,
-        #     data,
-        # )
+        logger.debug(
+            "Initialized new instance of {} with data: {}",
+            self.__class__.__name__,
+            data,
+        )


 SuccessfulT = TypeVar("SuccessfulT")
diff --git a/src/guidellm/scheduler/repro.py b/src/guidellm/scheduler/repro.py
deleted file mode 100644
index f9f76830..00000000
--- a/src/guidellm/scheduler/repro.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import asyncio
-import multiprocessing
-import time
-import logging
-import threading
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s',
-    datefmt='%H:%M:%S'
-)
-
-# A multiprocessing queue that will remain empty
-# Naming it mp_queue to distinguish from asyncio.Queue
-mp_queue = multiprocessing.Queue()
-
-
-async def get_item_from_mp_queue(q: multiprocessing.Queue, worker_id: int):
-    """
-    Coroutine that tries to get an item from a multiprocessing.Queue
-    using asyncio.to_thread.
-    """
-    logging.info(f"Worker {worker_id}: get_item_from_mp_queue: ENTERED. Awaiting asyncio.to_thread(q.get).")
-    try:
-        # This is the blocking call in a separate thread
-        item = await asyncio.to_thread(q.get)
-        # We don't expect this to be reached if the queue is empty
-        logging.info(
-            f"Worker {worker_id}: get_item_from_mp_queue: asyncio.to_thread RETURNED NORMALLY with item: {item}.")
-        return item
-    except asyncio.CancelledError:
-        # This is where it SHOULD go if the task awaiting this coroutine is cancelled,
-        # and asyncio.to_thread correctly propagates the cancellation to its awaiter.
-        logging.error(
-            f"Worker {worker_id}: get_item_from_mp_queue: CAUGHT CancelledError from asyncio.to_thread directly!")
-        raise  # Re-raise to propagate the cancellation
-    except Exception as e:
-        logging.error(f"Worker {worker_id}: get_item_from_mp_queue: CAUGHT an UNEXPECTED EXCEPTION {type(e)}: {e}",
-                      exc_info=True)
-        raise
-    finally:
-        # This finally block will execute. The key is whether the CancelledError was caught above.
-        logging.info(f"Worker {worker_id}: get_item_from_mp_queue: EXITED (finally block).")
-
-
-async def worker_coroutine(worker_id: int, q: multiprocessing.Queue):
-    """
-    The main coroutine for our worker task. It will try to get an item
-    from the queue.
-    """
-    logging.info(f"Worker {worker_id}: worker_coroutine: STARTED.")
-    try:
-        logging.info(f"Worker {worker_id}: worker_coroutine: About to await get_item_from_mp_queue.")
-        # This is the await point where CancelledError should be injected
-        # if this worker_coroutine task is cancelled.
-        await get_item_from_mp_queue(q, worker_id)
-        logging.info(f"Worker {worker_id}: worker_coroutine: get_item_from_mp_queue completed (unexpectedly).")
-    except asyncio.CancelledError:
-        logging.error(f"Worker {worker_id}: worker_coroutine: SUCCESSFULLY CAUGHT CancelledError.")
-        # Perform any task-specific cleanup here if needed
-    except Exception as e:
-        logging.error(f"Worker {worker_id}: worker_coroutine: CAUGHT UNEXPECTED EXCEPTION {type(e)}: {e}",
-                      exc_info=True)
-    finally:
-        logging.info(f"Worker {worker_id}: worker_coroutine: FINISHED (finally block).")
-
-
-async def main_orchestrator():
-    """
-    Orchestrates the test: creates, runs, and cancels the worker.
-    """
-    logging.info("Main Orchestrator: Starting worker task.")
-    worker_task = asyncio.create_task(worker_coroutine(1, mp_queue), name="WorkerCoroutine-1")
-
-    # Give the worker task a moment to start and block on the queue
-    logging.info("Main Orchestrator: Sleeping for 1 second to let worker block...")
-    await asyncio.sleep(1)
-
-    logging.info(f"Main Orchestrator: Current active threads: {[t.name for t_ in threading.enumerate()]}...")
-
-    # Cancel the worker task
-    print("Main Orchestrator: Cancelling worker_task...")
-    worker_task.cancel()
-
-    # Wait for the worker task to finish, with a timeout.
-    # If cancellation works as expected, worker_task should complete (by handling CancelledError)
-    # well before the timeout.
-    # If it gets stuck, asyncio.TimeoutError will be raised.
-    timeout_seconds = 5.0
-    logging.info(f"Main Orchestrator: Awaiting worker_task with timeout {timeout_seconds}s...")
-    try:
-        await asyncio.wait_for(worker_task, timeout=timeout_seconds)
-        logging.info("Main Orchestrator: worker_task completed WITHOUT timeout.")
-    except asyncio.TimeoutError:
-        logging.error(
-            f"Main Orchestrator: TIMEOUT! worker_task did not finish within {timeout_seconds}s after cancellation.")
-        logging.error(
-            f"Main Orchestrator: worker_task.done() = {worker_task.done()}, worker_task.cancelled() = {worker_task.cancelled()}")
-        # At this point, the thread running mp_queue.get() is likely still blocked.
-    except asyncio.CancelledError:
-        # This would happen if main_orchestrator itself was cancelled, not expected here.
-        logging.error("Main Orchestrator: main_orchestrator itself was cancelled (unexpected).")
-    except Exception as e:
-        logging.error(f"Main Orchestrator: An unexpected error occurred while waiting for worker_task: {e}",
-                      exc_info=True)
-    finally:
-        logging.info("Main Orchestrator: Test finished.")
-        # Note: The thread started by asyncio.to_thread for mp_queue.get()
-        # might still be alive and blocked if q.get() wasn't unblocked.
-        # It's a daemon thread by default, so it won't prevent program exit.
-        # To clean it up, one would typically put a sentinel into mp_queue.
-        # For this test, we are focused on the asyncio task cancellation.
-        logging.info(
-            f"Main Orchestrator: Final check: worker_task.done() = {worker_task.done()}, worker_task.cancelled() = {worker_task.cancelled()}")
-
-        # Attempt to unblock the queue to allow the thread to exit,
-        # though the test's focus is on the asyncio cancellation.
-        try:
-            mp_queue.put_nowait(None)  # Sentinel
-            logging.info("Main Orchestrator: Put sentinel in mp_queue to unblock thread.")
-        except Exception:
-            logging.warning("Main Orchestrator: Could not put sentinel in mp_queue.")
-
-
-if __name__ == "__main__":
-    # For multiprocessing queues to work correctly, especially on Windows/macOS
-    # with 'spawn' or 'forkserver' start methods, it's good practice
-    # to ensure the queue is created in the main process scope before tasks.
-    # In this simple script, it's fine.
-    try:
-        asyncio.run(main_orchestrator())
-    except KeyboardInterrupt:
-        logging.info("Main Orchestrator: Keyboard interrupt received.")
-    finally:
-        mp_queue.close()
-        mp_queue.join_thread()  # Ensure queue's feeder thread is joined
-        logging.info("Main Orchestrator: mp_queue resources released.")
diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py
index 4bf15971..5cbf2f7e 100644
--- a/src/guidellm/scheduler/result.py
+++ b/src/guidellm/scheduler/result.py
@@ -50,6 +50,7 @@ class SchedulerRunInfo(StandardBaseModel):
     end_number: float
     processes: int
     strategy: SchedulingStrategy
+    last_requests_statuses: deque[RequestStatus]
     max_error_rate: Optional[float] = None

     created_requests: int = 0
@@ -59,8 +60,6 @@ class SchedulerRunInfo(StandardBaseModel):
     completed_requests: int = 0
     errored_requests: int = 0

-    last_requests_statuses: Optional[deque[RequestStatus]] = None
-

 class SchedulerRequestInfo(StandardBaseModel):
     """
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index f6129d14..4e5bca3a 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -1,6 +1,5 @@
-import collections
-from datetime import timedelta
 import asyncio
+import collections
 import math
 import multiprocessing
 import multiprocessing.queues
@@ -11,8 +10,10 @@
 from typing import (
     Any,
     Generic,
+    Literal,
     Optional,
-    Union, Literal, cast,
+    Union,
+    cast,
 )

 from loguru import logger
@@ -128,9 +129,7 @@ async def run(
                 requests_queue,
                 responses_queue,
                 shutdown_event,
-            ) = await self._start_processes(
-                manager, executor, scheduling_strategy
-            )
+            ) = await self._start_processes(manager, executor, scheduling_strategy)
             if shutdown_event.is_set():
                 raise RuntimeError("shutdown_event is set before starting scheduling")

@@ -156,7 +155,6 @@ async def run(
                 ):
                     # we've exhausted all requests we've wanted to run
                     # and yielded all responses
-                    logger.info("run_info.completed_requests >= run_info.created_requests")
                     break

                 requests_iter = self._add_requests(
@@ -229,10 +227,7 @@ def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
                 f"{max_error} (max error)"
             )
             return max_error < run_info.errored_requests
-        elif(
-            run_info.strategy.type_ == "constant"
-            and run_info.end_number != math.inf
-        ):
+        elif run_info.strategy.type_ == "constant" and run_info.end_number != math.inf:
             current_error_ratio = run_info.errored_requests / run_info.end_number
             logger.debug(
                 f"Current error rate {current_error_ratio} "
@@ -241,13 +236,12 @@ def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
             return max_error < current_error_ratio
         elif settings.error_check_window_size <= run_info.completed_requests:
             last_requests_statuses = run_info.last_requests_statuses
-            last_errored_requests_count = len([
-                s
-                for s
-                in last_requests_statuses
-                if s == "error"
-            ])
-            current_error_ratio = last_errored_requests_count / len(last_requests_statuses)
+            last_errored_requests_count = len(
+                [s for s in last_requests_statuses if s == "error"]
+            )
+            current_error_ratio = last_errored_requests_count / len(
+                last_requests_statuses
+            )
             logger.debug(
                 f"Current error rate in "
                 f"last requests window is "
@@ -353,7 +347,7 @@ def _run_setup(
             max_error_rate=max_error_rate,
             last_requests_statuses=collections.deque(
                 maxlen=settings.error_check_window_size
-            )
+            ),
         )

         return info, requests_iter, times_iter
@@ -471,8 +465,7 @@ def _check_result_ready(
             run_info.errored_requests += 1

         request_status: Literal["error", "success"] = cast(
-            Literal["error", "success"],
-            "error" if is_errored else "success"
+            "Literal['error', 'success']", "error" if is_errored else "success"
         )
         run_info.last_requests_statuses.append(request_status)
diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py
index ce875409..784d4c21 100644
--- a/src/guidellm/scheduler/worker.py
+++ b/src/guidellm/scheduler/worker.py
@@ -2,12 +2,14 @@
 import math
 import multiprocessing.queues
 import queue
+import threading
 import time
+import typing
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from datetime import timedelta
 from multiprocessing.synchronize import Event as MultiprocessingEvent
+from threading import Event
 from typing import (
     Any,
     Generic,
@@ -42,7 +44,7 @@
 ]


-class ShutdownSignalReceived(Exception):
+class ShutdownSignalReceivedError(Exception):
     pass


@@ -127,11 +129,12 @@ async def resolve(
         ...

     async def get_request(
-        self, requests_queue: multiprocessing.Queue,
-        shutdown_event: MultiprocessingEvent,
-        process_id: int,
-        shutdown_poll_interval_seconds: float,
-    ) -> Optional[WorkerProcessRequest[RequestT]]:
+        self,
+        requests_queue: multiprocessing.Queue,
+        shutdown_event: threading.Event,
+        process_id: int,
+        shutdown_poll_interval_seconds: float,
+    ) -> WorkerProcessRequest[RequestT]:
         # We need to check shutdown_event intermittently because
         # if we simply use asyncio.to_thread(requests_queue.get)
         # the cancellation task doesn't propagate because the
         # asyncio.to_thread is blocking
         def _get_queue_intermittently():
             while True:
                 try:
                     return requests_queue.get(timeout=shutdown_poll_interval_seconds)
-                except queue.Empty:
+                except queue.Empty as e:
                     logger.info("Checking shutdown event is set in get_request")
                     if shutdown_event.is_set():
                         logger.info(f"Shutdown signal received in future {process_id}")
-                        raise asyncio.CancelledError()
+                        raise asyncio.CancelledError from e
+
         return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]

     async def send_result(
         self,
@@ -212,45 +216,56 @@ async def resolve_scheduler_request(
         asyncio.create_task(self.send_result(results_queue, result))

     def run_process(
-            self,
-            type_: Literal["sync", "async"],
-            requests_queue: multiprocessing.Queue,
-            results_queue: multiprocessing.Queue,
-            shutdown_event: multiprocessing.Event,
-            shutdown_poll_interval_seconds: float,
-            process_id: int,
-            max_concurrency: Optional[int] = None,
+        self,
+        type_: Literal["sync", "async"],
+        requests_queue: multiprocessing.Queue,
+        results_queue: multiprocessing.Queue,
+        shutdown_event: MultiprocessingEvent,
+        shutdown_poll_interval_seconds: float,
+        process_id: int,
+        max_concurrency: Optional[int] = None,
     ):
         async def _process_runner():
-            import threading
-            internal_shutdown_event = threading.Event()
+            # We are using a separate internal event
+            # because if we're using the shutdown_event
+            # there's a race condition between the get_request
+            # loop which checks for shutdown and the .cancel() in this
+            # method which causes the asyncio.CancelledError
+            # to propagate and crash the worker
+            internal_shutdown_event: threading.Event = Event()
             if type_ == "sync":
-                loop_task = asyncio.create_task(self._process_synchronous_requests_loop(
-                    requests_queue=requests_queue,
-                    results_queue=results_queue,
-                    process_id=process_id,
-                    shutdown_event=internal_shutdown_event,
-                    shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
-                ), name="request_loop_processor_task")
+                loop_task = asyncio.create_task(
+                    self._process_synchronous_requests_loop(
+                        requests_queue=requests_queue,
+                        results_queue=results_queue,
+                        process_id=process_id,
+                        shutdown_event=internal_shutdown_event,
+                        shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
+                    ),
+                    name="request_loop_processor_task",
+                )
             elif type_ == "async":
                 if max_concurrency is None:
-                    raise ValueError("max_concurrency must be set "
-                                     "for async processor")
-                loop_task = asyncio.create_task(self._process_asynchronous_requests_loop(
-                    requests_queue=requests_queue,
-                    results_queue=results_queue,
-                    max_concurrency=max_concurrency,
-                    process_id=process_id,
-                    shutdown_event=internal_shutdown_event,
-                    shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
-                ), name="request_loop_processor_task")
+                    raise ValueError("max_concurrency must be set for async processor")
+                loop_task = asyncio.create_task(
+                    self._process_asynchronous_requests_loop(
+                        requests_queue=requests_queue,
+                        results_queue=results_queue,
+                        max_concurrency=max_concurrency,
+                        process_id=process_id,
+                        shutdown_event=internal_shutdown_event,
+                        shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
+                    ),
+                    name="request_loop_processor_task",
+                )
             else:
                 raise ValueError(f"Invalid process type: {type_}")

             shutdown_task = asyncio.create_task(
                 self._wait_for_shutdown(
                     shutdown_event=shutdown_event,
-                    shutdown_poll_interval=shutdown_poll_interval_seconds
+                    shutdown_poll_interval=shutdown_poll_interval_seconds,
+                    process_id=process_id,
                 ),
                 name="shutdown_task",
             )
@@ -262,22 +277,26 @@ async def _process_runner():
                 ],
                 return_when=asyncio.FIRST_EXCEPTION,
             )
-            logger.info(f"First exception happened, done: [{[r.get_name() for r in done]}]")
+            logger.info(
+                f"First exception happened, done: [{[r.get_name() for r in done]}]"
+            )

             for task in pending:
-                logger.debug(f"Cancelling task {task.get_name()}")
-                cancel_result = task.cancel()
+                logger.debug(
+                    f"Cancelling task {task.get_name()}|| Process {process_id}"
+                )
+                task.cancel()
                 internal_shutdown_event.set()
-                logger.debug(f"{'Task is already done or canceled' if not cancel_result else 'sent cancel signal'}")
-                try:
+                try:  # noqa: SIM105
                     await task
                 except asyncio.CancelledError:
                     pass

             for task in done:
-                task_exception = task.exception()
-                if not isinstance(task_exception, ShutdownSignalReceived):
+                task_exception = typing.cast("Exception", task.exception())
+                if not isinstance(task_exception, ShutdownSignalReceivedError):
                     raise task_exception

         try:
             asyncio.run(_process_runner())
         except Exception as exc:  # noqa: BLE001
@@ -290,32 +309,35 @@ async def _process_runner():
             shutdown_event.set()  # ensure shutdown event is set to stop other processes

     async def _wait_for_shutdown(
-            self,
-            shutdown_event: MultiprocessingEvent,
-            shutdown_poll_interval: float,
+        self,
+        shutdown_event: MultiprocessingEvent,
+        shutdown_poll_interval: float,
+        process_id: int,
     ):
-        while not shutdown_event.is_set():
+        while not shutdown_event.is_set():  # noqa: ASYNC110
            await asyncio.sleep(shutdown_poll_interval)

         # Raising asyncio.CancelledError instead would
         # cause the asyncio.wait above to wait
         # forever; we couldn't find a clear reason why
-        raise ShutdownSignalReceived("Shutdown event set, cancelling process loop.")
+        raise ShutdownSignalReceivedError(
+            f"Shutdown event set for process {process_id}, cancelling process loop."
+        )

     async def _process_synchronous_requests_loop(
-            self,
-            requests_queue: multiprocessing.Queue,
-            results_queue: multiprocessing.Queue,
-            process_id: int,
-            shutdown_event: MultiprocessingEvent,
-            shutdown_poll_interval_seconds: float,
+        self,
+        requests_queue: multiprocessing.Queue,
+        results_queue: multiprocessing.Queue,
+        process_id: int,
+        shutdown_event: threading.Event,
+        shutdown_poll_interval_seconds: float,
     ):
         while True:
             process_request = await self.get_request(
                 requests_queue=requests_queue,
                 shutdown_event=shutdown_event,
                 process_id=process_id,
-                shutdown_poll_interval_seconds=shutdown_poll_interval_seconds
+                shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
             )
             dequeued_time = time.time()
@@ -336,7 +358,7 @@ async def _process_asynchronous_requests_loop(
         results_queue: multiprocessing.Queue,
         max_concurrency: int,
         process_id: int,
-        shutdown_event: MultiprocessingEvent,
+        shutdown_event: threading.Event,
         shutdown_poll_interval_seconds: float,
     ):
         pending = asyncio.Semaphore(max_concurrency)
@@ -431,36 +453,25 @@ async def prepare_multiprocessing(self):
         """
         await self.backend.prepare_multiprocessing()

-    def process_loop_synchronous(
+    def run_process(
         self,
+        type_: Literal["sync", "async"],
         requests_queue: multiprocessing.Queue,
         results_queue: multiprocessing.Queue,
+        shutdown_event: MultiprocessingEvent,
+        shutdown_poll_interval_seconds: float,
         process_id: int,
-        shutdown_event: Optional[MultiprocessingEvent] = None,
+        max_concurrency: Optional[int] = None,
     ):
         asyncio.run(self.backend.validate())
-        super().process_loop_synchronous(
+        super().run_process(
+            type_=type_,
             requests_queue=requests_queue,
             results_queue=results_queue,
-            process_id=process_id,
             shutdown_event=shutdown_event,
-        )
-
-    def process_loop_asynchronous(
-        self,
-        requests_queue: multiprocessing.Queue,
-        results_queue: multiprocessing.Queue,
-        max_concurrency: int,
-        process_id: int,
-        shutdown_event: Optional[MultiprocessingEvent] = None,
-    ):
-        asyncio.run(self.backend.validate())
-        super().process_loop_asynchronous(
-            requests_queue=requests_queue,
-            results_queue=results_queue,
-            max_concurrency=max_concurrency,
+            shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
             process_id=process_id,
-            shutdown_event=shutdown_event,
+            max_concurrency=max_concurrency,
         )

     async def resolve(
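
Aside: patch 34's race-condition comment hinges on the two event scopes in play. A short illustrative sketch of the distinction (names are assumptions for the example):

import multiprocessing
import threading

if __name__ == "__main__":
    manager = multiprocessing.Manager()
    cross_process_shutdown = manager.Event()    # shared scheduler -> worker processes
    process_local_shutdown = threading.Event()  # visible only within one worker process
    cross_process_shutdown.set()
    print(cross_process_shutdown.is_set(), process_local_shutdown.is_set())

The worker's request loop polls the process-local event, which only the worker's own runner sets after it has already decided to cancel; polling the shared multiprocessing event directly could race the runner's cancel() and surface a stray asyncio.CancelledError — the crash the comment describes.
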
" - "Applicable only for finite deterministic scenarios i.e " - "rate_type is 'constant' and 'max_seconds' exists OR " - "'max_requests' exists OR the dataset is finite. " - "If None or not applicable, benchmarks will continue regardless of error rate." + "The maximum error after which a benchmark will stop. " + "Can either be a rate i.e 0 < rate < 1 or constant number. " + "If rate is given and rate_type is 'constant' and 'max_seconds' exists " + "then the rate will be calculated as part of the total expected " + "requests count i.e rate * duration. If rate is given and number" + "of requests is not pre-determined than a context window " + "of the last requests will be looked at. Context window size" + "is configurable under GUIDELLM__ERROR_CHECK_WINDOW_SIZE." + "If a number above 1 is given than we just count the total" + "number of error and check if it's above the threshold." ), ) @click.option( @@ -253,7 +258,7 @@ def benchmark( rate, max_seconds, max_requests, - max_error_rate, + max_error, warmup_percent, cooldown_percent, disable_progress, @@ -279,7 +284,7 @@ def benchmark( rate=rate, max_seconds=max_seconds, max_requests=max_requests, - max_error_rate=max_error_rate, + max_error=max_error, warmup_percent=warmup_percent, cooldown_percent=cooldown_percent, show_progress=not disable_progress, diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index d33e6a56..c2d8c011 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -90,8 +90,8 @@ class BenchmarkArgs(StandardBaseModel): max_duration: Optional[float] = Field( description="The maximum duration in seconds to run this benchmark, if any." ) - max_error_rate: Optional[float] = Field( - description="Maximum error rate after which a benchmark will stop." + max_error: Optional[float] = Field( + description="Maximum error rate or const after which a benchmark will stop." ) warmup_number: Optional[int] = Field( description=( @@ -220,7 +220,7 @@ class BenchmarkRunStats(StandardBaseModel): description=( "The number of errored requests divided by the number " "of successful and errored requests. " - "This can be higher than max_error_rate " + "This can be higher than max_error " "(if applicable) cause it does not take into " "account incomplete requests." 
) diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index dddcadb5..7a9f41ee 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -74,8 +74,10 @@ class BenchmarkerStrategyLimits(StandardBaseModel): description="Maximum duration (in seconds) to process requests per strategy.", ge=0, ) - max_error_rate: Optional[float] = Field( - description="Maximum error rate after which a benchmark will stop", + max_error: Optional[float] = Field( + description="Maximum error after which a " + "benchmark will stop," + " either rate or fixed number", ge=0, ) warmup_percent_per_strategy: Optional[float] = Field( @@ -152,7 +154,7 @@ async def run( profile: Profile, max_number_per_strategy: Optional[int], max_duration_per_strategy: Optional[float], - max_error_rate: Optional[float], + max_error: Optional[float], warmup_percent_per_strategy: Optional[float], cooldown_percent_per_strategy: Optional[float], ) -> AsyncGenerator[ @@ -167,7 +169,7 @@ async def run( requests_loader_size=requests_loader_size, max_number_per_strategy=max_number_per_strategy, max_duration_per_strategy=max_duration_per_strategy, - max_error_rate=max_error_rate, + max_error=max_error, warmup_percent_per_strategy=warmup_percent_per_strategy, cooldown_percent_per_strategy=cooldown_percent_per_strategy, ) @@ -202,7 +204,7 @@ async def run( scheduling_strategy=scheduling_strategy, max_number=max_number_per_strategy, max_duration=max_duration_per_strategy, - max_error_rate=max_error_rate, + max_error=max_error, ): if result.type_ == "run_start": yield BenchmarkerResult( @@ -328,7 +330,7 @@ def create_benchmark_aggregator( strategy=strategy, max_number=limits.max_number, max_duration=limits.max_duration, - max_error_rate=limits.max_error_rate, + max_error=limits.max_error, warmup_number=limits.warmup_number, warmup_duration=limits.warmup_duration, cooldown_number=limits.cooldown_number, diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index a5e4da3b..e70ae0a6 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -41,7 +41,7 @@ async def benchmark_generative_text( rate: Optional[Union[float, list[float]]], max_seconds: Optional[float], max_requests: Optional[int], - max_error_rate: Optional[float], + max_error: Optional[float], warmup_percent: Optional[float], cooldown_percent: Optional[float], show_progress: bool, @@ -108,7 +108,7 @@ async def benchmark_generative_text( profile=profile, max_number_per_strategy=max_requests, max_duration_per_strategy=max_seconds, - max_error_rate=max_error_rate, + max_error=max_error, warmup_percent_per_strategy=warmup_percent, cooldown_percent_per_strategy=cooldown_percent, ): diff --git a/src/guidellm/benchmark/output.py b/src/guidellm/benchmark/output.py index 33b1efc2..ac32bc4f 100644 --- a/src/guidellm/benchmark/output.py +++ b/src/guidellm/benchmark/output.py @@ -419,7 +419,7 @@ def benchmarks_args_str(self) -> str: { "max_number": args.max_number, "max_duration": args.max_duration, - "max_error_rate": args.max_error_rate, + "max_error": args.max_error, "warmup_number": args.warmup_number, "warmup_duration": args.warmup_duration, "cooldown_number": args.cooldown_number, diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 5cbf2f7e..4f4d5c87 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -51,7 +51,7 @@ class SchedulerRunInfo(StandardBaseModel): processes: int 
strategy: SchedulingStrategy last_requests_statuses: deque[RequestStatus] - max_error_rate: Optional[float] = None + max_error: Optional[float] = None created_requests: int = 0 queued_requests: int = 0 diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 4e5bca3a..e84c2320 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -76,7 +76,7 @@ async def run( scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, - max_error_rate: Optional[float] = None, + max_error: Optional[float] = None, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -105,8 +105,8 @@ async def run( :param max_duration: The maximum duration for the scheduling run. If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. - :param max_error_rate: The maximum error rate after which the - scheduler shuts down. + :param max_error: The maximum error rate or const + after which the scheduler shuts down. Only applicable in benchmarks with finite deterministic number of requests. If None or not applicable then scheduler will continue regardless of errors. :return: An asynchronous generator that yields SchedulerResult objects. @@ -114,7 +114,7 @@ async def run( the response, and the run information. """ self._validate_scheduler_params( - scheduling_strategy, max_duration, max_error_rate, max_number + scheduling_strategy, max_duration, max_error, max_number ) with ( @@ -134,7 +134,7 @@ async def run( raise RuntimeError("shutdown_event is set before starting scheduling") run_info, requests_iter, times_iter = self._run_setup( - futures, scheduling_strategy, max_number, max_duration, max_error_rate + futures, scheduling_strategy, max_number, max_duration, max_error ) yield SchedulerResult( type_="run_start", @@ -142,8 +142,8 @@ async def run( ) try: - max_error_rate_reached = False - while not max_error_rate_reached: + max_error_reached = False + while not max_error_reached: # check errors and raise them for future in futures: if future.done() and (err := future.exception()) is not None: @@ -173,13 +173,13 @@ async def run( if ( iter_result.request_info.errored and not iter_result.request_info.canceled - and self._is_max_error_rate_reached(iter_result.run_info) + and self._is_max_error_reached(iter_result.run_info) ): shutdown_event.set() - max_error_rate_reached = True + max_error_reached = True logger.info( f"Max error rate of " - f"({iter_result.run_info.max_error_rate}) " + f"({iter_result.run_info.max_error}) " f"reached, sending shutdown signal" ) yield iter_result @@ -200,7 +200,7 @@ def _validate_scheduler_params( self, scheduling_strategy: SchedulingStrategy, max_duration: Optional[float], - max_error_rate: Optional[float], + max_error: Optional[float], max_number: Optional[int], ) -> None: if scheduling_strategy is None or not isinstance( @@ -211,11 +211,11 @@ def _validate_scheduler_params( raise ValueError(f"Invalid max_number: {max_number}") if max_duration is not None and max_duration < 0: raise ValueError(f"Invalid max_duration: {max_duration}") - if max_error_rate is not None and (max_error_rate < 0): - raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + if max_error is not None and (max_error < 0): + raise ValueError(f"Invalid max_error: {max_error}") - def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: - max_error = run_info.max_error_rate + def 
_is_max_error_reached(self, run_info: SchedulerRunInfo) -> bool: + max_error = run_info.max_error if max_error is None: return False @@ -322,7 +322,7 @@ def _run_setup( scheduling_strategy: SchedulingStrategy, max_number: Optional[int], max_duration: Optional[float], - max_error_rate: Optional[float], + max_error: Optional[float], ) -> tuple[SchedulerRunInfo, Iterator[Any], Iterator[float]]: requests_iter = iter(self.request_loader) start_time = time.time() @@ -344,7 +344,7 @@ def _run_setup( end_number=end_number, processes=len(processes), strategy=scheduling_strategy, - max_error_rate=max_error_rate, + max_error=max_error, last_requests_statuses=collections.deque( maxlen=settings.error_check_window_size ), From 5783d629efcf7591f48136472965f89f6478a1b2 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 5 Jun 2025 14:32:34 +0300 Subject: [PATCH 36/37] fix tests --- tests/unit/benchmark/test_output.py | 2 +- tests/unit/mock_benchmark.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/benchmark/test_output.py b/tests/unit/benchmark/test_output.py index e3114491..de32b44b 100644 --- a/tests/unit/benchmark/test_output.py +++ b/tests/unit/benchmark/test_output.py @@ -113,7 +113,7 @@ def test_console_benchmarks_args_str(): mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_args_str == ( - "max_number=None, max_duration=10.0, max_error_rate=0.05, warmup_number=None, " + "max_number=None, max_duration=10.0, max_error=0.05, warmup_number=None, " "warmup_duration=None, cooldown_number=None, cooldown_duration=None" ) diff --git a/tests/unit/mock_benchmark.py b/tests/unit/mock_benchmark.py index 3c360c68..4a8a1f29 100644 --- a/tests/unit/mock_benchmark.py +++ b/tests/unit/mock_benchmark.py @@ -221,7 +221,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: strategy=SynchronousStrategy(), max_number=None, max_duration=10.0, - max_error_rate=0.05, + max_error=0.05, warmup_number=None, warmup_duration=None, cooldown_number=None, From 85cb24d904e844ce3c7a25a78fb8a47f9858c8f3 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 5 Jun 2025 14:33:56 +0300 Subject: [PATCH 37/37] invert if --- src/guidellm/scheduler/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index e84c2320..31ac5c61 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -219,7 +219,7 @@ def _is_max_error_reached(self, run_info: SchedulerRunInfo) -> bool: if max_error is None: return False - if not max_error < 1: + if max_error >= 1: # Absolute error count, i.e not a ratio logger.debug( f"Current error count "