From b7638b038ced93fe061b9e44191b50b14dfa1c39 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Sun, 18 May 2025 17:42:00 +0300 Subject: [PATCH 01/37] wip // max error rate in scheduler --- src/guidellm/scheduler/result.py | 2 ++ src/guidellm/scheduler/scheduler.py | 23 +++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 0f12687f..9d379422 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -46,12 +46,14 @@ class SchedulerRunInfo(StandardBaseModel): end_number: float processes: int strategy: SchedulingStrategy + max_error_rate: float created_requests: int = 0 queued_requests: int = 0 scheduled_requests: int = 0 processing_requests: int = 0 completed_requests: int = 0 + errored_requests: int = 0 class SchedulerRequestInfo(StandardBaseModel): diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 06203827..33204729 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -64,12 +64,14 @@ def __init__( self.worker = worker self.request_loader = request_loader + self.error_rate: Optional[float] = None async def run( self, scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, + max_error_rate: Optional[float] = 0.05, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -98,6 +100,8 @@ async def run( :param max_duration: The maximum duration for the scheduling run. If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. + :param max_error_rate: The maximum error rate after which the scheduler shuts down. + If not provided a default of 5% i.e 0.05 is used. :return: An asynchronous generator that yields SchedulerResult objects. Each SchedulerResult object contains information about the request, the response, and the run information. 
@@ -109,9 +113,12 @@ async def run( if max_number is not None and max_number < 1: raise ValueError(f"Invalid max_number: {max_number}") - if max_duration is not None and max_duration < 0: raise ValueError(f"Invalid max_duration: {max_duration}") + if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): + raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + + shutdown_event = multiprocessing.Event() with ( multiprocessing.Manager() as manager, @@ -124,7 +131,7 @@ async def run( manager, executor, scheduling_strategy ) run_info, requests_iter, times_iter = self._run_setup( - futures, scheduling_strategy, max_number, max_duration + futures, scheduling_strategy, max_number, max_duration, max_error_rate ) yield SchedulerResult( type_="run_start", @@ -159,6 +166,8 @@ async def run( run_info, ) if iter_result is not None: + if self._is_max_error_rate_reached(iter_result.run_info): + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") yield iter_result # yield control to the event loop @@ -249,6 +258,7 @@ def _run_setup( scheduling_strategy: SchedulingStrategy, max_number: Optional[int], max_duration: Optional[float], + max_error_rate: Optional[float], ) -> tuple[SchedulerRunInfo, Iterator[Any], Iterator[float]]: requests_iter = iter(self.request_loader) start_time = time.time() @@ -276,6 +286,7 @@ def _run_setup( end_number=end_number, processes=len(processes), strategy=scheduling_strategy, + max_error_rate=max_error_rate ) return info, requests_iter, times_iter @@ -362,6 +373,9 @@ def _check_result_ready( run_info.processing_requests -= 1 run_info.completed_requests += 1 + if process_response.info.errored: + run_info.errored_requests += 1 + return SchedulerRequestResult( type_="request_complete", run_info=run_info, @@ -371,6 +385,11 @@ def _check_result_ready( ) raise ValueError(f"Invalid process response type: {process_response}") + @staticmethod + def _is_max_error_rate_reached(run_info: SchedulerRunInfo) -> bool: + current_error_rate = run_info.errored_requests / run_info.end_number + return current_error_rate > run_info.max_error_rate + async def _stop_processes( self, futures: list[asyncio.Future], From 6059af183ebed636af5a2a7eed4707d943f8e7db Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Mon, 19 May 2025 11:41:33 +0300 Subject: [PATCH 02/37] wip --- src/guidellm/scheduler/scheduler.py | 18 ++++++++++-------- src/guidellm/scheduler/worker.py | 9 +++++++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 33204729..cd9231af 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -64,7 +64,6 @@ def __init__( self.worker = worker self.request_loader = request_loader - self.error_rate: Optional[float] = None async def run( self, @@ -118,8 +117,6 @@ async def run( if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): raise ValueError(f"Invalid max_error_rate: {max_error_rate}") - shutdown_event = multiprocessing.Event() - with ( multiprocessing.Manager() as manager, ProcessPoolExecutor( @@ -127,7 +124,7 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue = await self._start_processes( + futures, requests_queue, responses_queue, shutdown_event = await self._start_processes( manager, executor, scheduling_strategy ) run_info, requests_iter, times_iter = self._run_setup( @@ -167,7 +164,9 @@ async def run( 
) if iter_result is not None: if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached, sending " + f"shutdown signal") + shutdown_event.set() yield iter_result # yield control to the event loop @@ -191,8 +190,10 @@ async def _start_processes( list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, + multiprocessing.Event ]: await self.worker.prepare_multiprocessing() + shutdown_event = multiprocessing.Event() requests_queue = manager.Queue( maxsize=scheduling_strategy.queued_requests_limit ) @@ -229,6 +230,7 @@ async def _start_processes( requests_queue, responses_queue, id_, + shutdown_event, ) ) elif scheduling_strategy.processing_mode == "async": @@ -240,6 +242,7 @@ async def _start_processes( responses_queue, requests_limit, id_, + shutdown_event, ) ) else: @@ -250,7 +253,7 @@ async def _start_processes( await asyncio.sleep(0.1) # give time for processes to start - return futures, requests_queue, responses_queue + return futures, requests_queue, responses_queue, shutdown_event def _run_setup( self, @@ -385,8 +388,7 @@ def _check_result_ready( ) raise ValueError(f"Invalid process response type: {process_response}") - @staticmethod - def _is_max_error_rate_reached(run_info: SchedulerRunInfo) -> bool: + def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: current_error_rate = run_info.errored_requests / run_info.end_number return current_error_rate > run_info.max_error_rate diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index a53b14c2..2dfd4462 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -121,9 +121,13 @@ async def resolve( ... async def get_request( - self, requests_queue: multiprocessing.Queue + self, requests_queue: multiprocessing.Queue, shutdown_event: multiprocessing.Event, shutdonen_check_ ) -> Optional[WorkerProcessRequest[RequestT]]: - return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] + def _get_queue_intermittently(request_queue: multiprocessing.Queue, shutdown_event): + try: + + + return await asyncio.to_thread(_get_queue_intermittently()) # type: ignore[attr-defined] async def send_result( self, @@ -222,6 +226,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: multiprocessing.Event, ): async def _process_runner(): pending = asyncio.Semaphore(max_concurrency) From 69a5c9eb5b5a272a0821dfdf569ed0acf5bcaffe Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Mon, 19 May 2025 13:51:05 +0300 Subject: [PATCH 03/37] Revert "wip" This reverts commit 6059af183ebed636af5a2a7eed4707d943f8e7db. 
--- src/guidellm/scheduler/scheduler.py | 18 ++++++++---------- src/guidellm/scheduler/worker.py | 9 ++------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index cd9231af..33204729 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -64,6 +64,7 @@ def __init__( self.worker = worker self.request_loader = request_loader + self.error_rate: Optional[float] = None async def run( self, @@ -117,6 +118,8 @@ async def run( if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + shutdown_event = multiprocessing.Event() + with ( multiprocessing.Manager() as manager, ProcessPoolExecutor( @@ -124,7 +127,7 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue, shutdown_event = await self._start_processes( + futures, requests_queue, responses_queue = await self._start_processes( manager, executor, scheduling_strategy ) run_info, requests_iter, times_iter = self._run_setup( @@ -164,9 +167,7 @@ async def run( ) if iter_result is not None: if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached, sending " - f"shutdown signal") - shutdown_event.set() + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") yield iter_result # yield control to the event loop @@ -190,10 +191,8 @@ async def _start_processes( list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, - multiprocessing.Event ]: await self.worker.prepare_multiprocessing() - shutdown_event = multiprocessing.Event() requests_queue = manager.Queue( maxsize=scheduling_strategy.queued_requests_limit ) @@ -230,7 +229,6 @@ async def _start_processes( requests_queue, responses_queue, id_, - shutdown_event, ) ) elif scheduling_strategy.processing_mode == "async": @@ -242,7 +240,6 @@ async def _start_processes( responses_queue, requests_limit, id_, - shutdown_event, ) ) else: @@ -253,7 +250,7 @@ async def _start_processes( await asyncio.sleep(0.1) # give time for processes to start - return futures, requests_queue, responses_queue, shutdown_event + return futures, requests_queue, responses_queue def _run_setup( self, @@ -388,7 +385,8 @@ def _check_result_ready( ) raise ValueError(f"Invalid process response type: {process_response}") - def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: + @staticmethod + def _is_max_error_rate_reached(run_info: SchedulerRunInfo) -> bool: current_error_rate = run_info.errored_requests / run_info.end_number return current_error_rate > run_info.max_error_rate diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 2dfd4462..a53b14c2 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -121,13 +121,9 @@ async def resolve( ... 
async def get_request( - self, requests_queue: multiprocessing.Queue, shutdown_event: multiprocessing.Event, shutdonen_check_ + self, requests_queue: multiprocessing.Queue ) -> Optional[WorkerProcessRequest[RequestT]]: - def _get_queue_intermittently(request_queue: multiprocessing.Queue, shutdown_event): - try: - - - return await asyncio.to_thread(_get_queue_intermittently()) # type: ignore[attr-defined] + return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] async def send_result( self, @@ -226,7 +222,6 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: multiprocessing.Event, ): async def _process_runner(): pending = asyncio.Semaphore(max_concurrency) From 7795d2c23b0e3449506764102343eafbe486c5a6 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Mon, 19 May 2025 14:28:32 +0300 Subject: [PATCH 04/37] Handle infinite datasets with constant rate --- src/guidellm/request/loader.py | 8 +++++++- src/guidellm/scheduler/result.py | 2 +- src/guidellm/scheduler/scheduler.py | 17 ++++++++++++++--- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/guidellm/request/loader.py b/src/guidellm/request/loader.py index 50ab3cca..0e54fc45 100644 --- a/src/guidellm/request/loader.py +++ b/src/guidellm/request/loader.py @@ -21,9 +21,14 @@ "GenerativeRequestLoaderDescription", "RequestLoader", "RequestLoaderDescription", + "InfiniteDatasetError" ] +class InfiniteDatasetError(Exception): + pass + + class RequestLoaderDescription(StandardBaseModel): type_: Literal["request_loader"] = "request_loader" @@ -120,7 +125,8 @@ def __len__(self) -> int: if self.iter_type == "finite": return self.num_unique_items() - raise ValueError(f"Unable to determine length of dataset: {self.data}") + assert self.iter_type == "infinite" + raise InfiniteDatasetError(f"Dataset {self.data} is infinite and thus unable to determine length") @property def description(self) -> GenerativeRequestLoaderDescription: diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 9d379422..a340932d 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -43,7 +43,7 @@ class SchedulerRunInfo(StandardBaseModel): start_time: float end_time: float - end_number: float + end_number: float # ToDo: Rename to max_requests & change to int (check all references before) processes: int strategy: SchedulingStrategy max_error_rate: float diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 33204729..d0d06a4a 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -15,6 +15,7 @@ from loguru import logger from guidellm.config import settings +from guidellm.request.loader import InfiniteDatasetError from guidellm.scheduler.result import ( SchedulerRequestResult, SchedulerResult, @@ -166,8 +167,12 @@ async def run( run_info, ) if iter_result is not None: - if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") + if iter_result.request_info.errored: + if self._is_max_error_rate_reached(iter_result.run_info): + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") + else: + cur_error_rate = iter_result.run_info.errored_requests / iter_result.run_info.end_number + logger.debug(f"Current error rate {cur_error_rate}") yield iter_result # yield control to the event loop @@ -271,7 +276,13 @@ def 
_run_setup( iter_length = len(self.request_loader) # type: ignore[arg-type] if 0 < iter_length < end_number: end_number = iter_length - except Exception: # noqa: BLE001, S110 + except InfiniteDatasetError: # noqa: BLE001, S110 + if scheduling_strategy.type_ == "constant" and max_duration is not None: + end_number = scheduling_strategy.rate * max_duration + else: + # ToDo: Maybe add poison? + raise + except Exception: pass if end_number == math.inf and end_time is None: From 6d688f0bdbcb01b1735fd77971e2c82a28a38e32 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Mon, 19 May 2025 11:41:33 +0300 Subject: [PATCH 05/37] minor bug fixes --- src/guidellm/benchmark/benchmark.py | 2 +- src/guidellm/scheduler/scheduler.py | 42 +++++++++++++++------- src/guidellm/scheduler/worker.py | 54 ++++++++++++++++++++++++----- 3 files changed, 76 insertions(+), 22 deletions(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 4e2e09a3..50d2f49c 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -701,7 +701,7 @@ def from_stats( *["incomplete"] * len(incomplete), # type: ignore[list-item] *["error"] * len(errored), # type: ignore[list-item] ] - start_time = min(req.start_time for req in total) + start_time = min(req.start_time for req in total) # ToDo: Fix if total is empty end_time = max(req.end_time for req in total) total_with_prompt, total_types_with_prompt = ( diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index d0d06a4a..c58ef363 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -72,7 +72,7 @@ async def run( scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, - max_error_rate: Optional[float] = 0.05, + max_error_rate: Optional[float] = 0, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -119,8 +119,6 @@ async def run( if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): raise ValueError(f"Invalid max_error_rate: {max_error_rate}") - shutdown_event = multiprocessing.Event() - with ( multiprocessing.Manager() as manager, ProcessPoolExecutor( @@ -128,9 +126,11 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue = await self._start_processes( - manager, executor, scheduling_strategy + futures, requests_queue, responses_queue, shutdown_event = await self._start_processes( + manager, executor, scheduling_strategy, max_error_rate is not None ) + if shutdown_event: + assert not shutdown_event.is_set() run_info, requests_iter, times_iter = self._run_setup( futures, scheduling_strategy, max_number, max_duration, max_error_rate ) @@ -169,10 +169,15 @@ async def run( if iter_result is not None: if iter_result.request_info.errored: if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) reached!") - else: - cur_error_rate = iter_result.run_info.errored_requests / iter_result.run_info.end_number - logger.debug(f"Current error rate {cur_error_rate}") + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) " + f"reached, sending shutdown signal") + shutdown_event.set() + break + # else: + # # ToDo: Delete this else clause + # cur_error_rate = iter_result.run_info.errored_requests / iter_result.run_info.end_number + # logger.info(f"Current error rate 
{cur_error_rate}") + yield iter_result # yield control to the event loop @@ -192,12 +197,15 @@ async def _start_processes( manager, executor: ProcessPoolExecutor, scheduling_strategy: SchedulingStrategy, + create_shutdown_event: bool = False ) -> tuple[ list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, + Optional[multiprocessing.Event] ]: await self.worker.prepare_multiprocessing() + shutdown_event = manager.Event() if create_shutdown_event else None requests_queue = manager.Queue( maxsize=scheduling_strategy.queued_requests_limit ) @@ -207,6 +215,7 @@ async def _start_processes( scheduling_strategy.processes_limit, scheduling_strategy.processing_requests_limit, ) + num_processes = 1 requests_limit_split = ( scheduling_strategy.processing_requests_limit // scheduling_strategy.processes_limit @@ -234,6 +243,7 @@ async def _start_processes( requests_queue, responses_queue, id_, + shutdown_event, ) ) elif scheduling_strategy.processing_mode == "async": @@ -245,6 +255,7 @@ async def _start_processes( responses_queue, requests_limit, id_, + shutdown_event, ) ) else: @@ -255,7 +266,7 @@ async def _start_processes( await asyncio.sleep(0.1) # give time for processes to start - return futures, requests_queue, responses_queue + return futures, requests_queue, responses_queue, shutdown_event def _run_setup( self, @@ -278,13 +289,19 @@ def _run_setup( end_number = iter_length except InfiniteDatasetError: # noqa: BLE001, S110 if scheduling_strategy.type_ == "constant" and max_duration is not None: - end_number = scheduling_strategy.rate * max_duration + total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) + if total_requests_in_max_duration < end_number: + assert total_requests_in_max_duration > 0 + end_number = total_requests_in_max_duration else: - # ToDo: Maybe add poison? + # ToDo: Add poison raise except Exception: pass + if end_number == math.inf and max_error_rate is not None: + raise RuntimeError("Can't ensure max_error_rate since can't calculate total requests count") + if end_number == math.inf and end_time is None: logger.warning( "No end number or end time set, " @@ -409,4 +426,5 @@ async def _stop_processes( for _ in futures: requests_queue.put(None) + logger.debug("Waiting for futures to shut down") await asyncio.gather(*futures) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index a53b14c2..4515fefa 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -2,10 +2,12 @@ import math import multiprocessing import multiprocessing.queues +import queue import time from abc import ABC, abstractmethod from collections.abc import AsyncGenerator from dataclasses import dataclass +from datetime import timedelta from typing import ( Any, Generic, @@ -121,9 +123,23 @@ async def resolve( ... 
async def get_request( - self, requests_queue: multiprocessing.Queue + self, requests_queue: multiprocessing.Queue, + shutdown_event: Optional[multiprocessing.Event] = None, + process_id: Optional[int] = None, ) -> Optional[WorkerProcessRequest[RequestT]]: - return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] + if shutdown_event is not None and process_id is None: + logger.warning("shutdown_event is not None and process_id is None which makes it hard to debug") + + def _get_queue_intermittently(): + assert shutdown_event is not None + while True: + try: + return requests_queue.get(timeout=timedelta(seconds=1).total_seconds()) + except queue.Empty: + if shutdown_event.is_set(): + logger.info(f"Shutdown signal received in future {process_id}") + return + return await asyncio.to_thread(_get_queue_intermittently if shutdown_event is not None else requests_queue.get) # type: ignore[attr-defined] async def send_result( self, @@ -149,25 +165,25 @@ async def resolve_scheduler_request( scheduled_time=time.time(), process_id=process_id, ) - result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult( + request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult( type_="request_scheduled", request=request, response=None, info=info, ) - asyncio.create_task(self.send_result(results_queue, result)) + asyncio.create_task(self.send_result(results_queue, request_scheduled_result)) if (wait_time := start_time - time.time()) > 0: await asyncio.sleep(wait_time) info.worker_start = time.time() - result = WorkerProcessResult( + request_start_result = WorkerProcessResult( type_="request_start", request=request, response=None, info=info, ) - asyncio.create_task(self.send_result(results_queue, result)) + asyncio.create_task(self.send_result(results_queue, request_start_result)) status, response = await self.resolve(request, timeout_time) info.worker_end = time.time() @@ -190,11 +206,20 @@ def process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: Optional[multiprocessing.Event] = None, ): async def _process_runner(): while ( - process_request := await self.get_request(requests_queue) + process_request := await self.get_request( + requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id, + ) ) is not None: + if shutdown_event and shutdown_event.is_set(): + logger.info(f"Shutdown signal received in future {process_id}") + break + dequeued_time = time.time() await self.resolve_scheduler_request( @@ -222,6 +247,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: Optional[multiprocessing.Event] = None, ): async def _process_runner(): pending = asyncio.Semaphore(max_concurrency) @@ -230,7 +256,10 @@ async def _process_runner(): raise ValueError("Async worker called with max_concurrency < 1") while ( - process_request := await self.get_request(requests_queue) + process_request := await self.get_request( + requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id) ) is not None: dequeued_time = time.time() @@ -240,6 +269,9 @@ def _task_done(_: asyncio.Task): nonlocal pending pending.release() + if shutdown_event and shutdown_event.is_set(): + logger.info(f"Shutdown signal received in future {process_id}") + break task = asyncio.create_task( self.resolve_scheduler_request( request=process_request.request, @@ -314,12 +346,14 @@ def 
process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: Optional[multiprocessing.Event] = None ): asyncio.run(self.backend.validate()) super().process_loop_synchronous( requests_queue=requests_queue, results_queue=results_queue, process_id=process_id, + shutdown_event=shutdown_event, ) def process_loop_asynchronous( @@ -328,6 +362,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: Optional[multiprocessing.Event] = None ): asyncio.run(self.backend.validate()) super().process_loop_asynchronous( @@ -335,6 +370,7 @@ def process_loop_asynchronous( results_queue=results_queue, max_concurrency=max_concurrency, process_id=process_id, + shutdown_event=shutdown_event, ) async def resolve( @@ -375,7 +411,7 @@ async def resolve( request_func, request_kwargs = self._create_request_func_kwargs(request) async def _runner(): - # wrap function so we can enforce timeout and + # wrap function so that we can enforce timeout and # still return the latest state from the backend async for resp in request_func(**request_kwargs): # type: ignore[operator] nonlocal response From ede651aca1bc0fd0de65fc869bea09798f1902c2 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 13:21:19 +0300 Subject: [PATCH 06/37] bugfix / last request not yielded --- src/guidellm/scheduler/scheduler.py | 71 ++++++++++++++++------------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index c58ef363..628a9ac7 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -72,7 +72,7 @@ async def run( scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, - max_error_rate: Optional[float] = 0, + max_error_rate: Optional[float] = None, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -140,7 +140,8 @@ async def run( ) try: - while True: + max_error_rate_reached = False + while not max_error_rate_reached: # check errors and raise them for future in futures: if future.done() and (err := future.exception()) is not None: @@ -167,17 +168,13 @@ async def run( run_info, ) if iter_result is not None: - if iter_result.request_info.errored: - if self._is_max_error_rate_reached(iter_result.run_info): - logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) " - f"reached, sending shutdown signal") - shutdown_event.set() - break - # else: - # # ToDo: Delete this else clause - # cur_error_rate = iter_result.run_info.errored_requests / iter_result.run_info.end_number - # logger.info(f"Current error rate {cur_error_rate}") - + if iter_result.request_info.errored \ + and not iter_result.request_info.canceled \ + and self._is_max_error_rate_reached(iter_result.run_info): + shutdown_event.set() + max_error_rate_reached = True + logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) " + f"reached, sending shutdown signal") yield iter_result # yield control to the event loop @@ -280,27 +277,10 @@ def _run_setup( start_time = time.time() times_iter = iter(scheduling_strategy.request_times()) end_time = time.time() + (max_duration or math.inf) - end_number = max_number or math.inf - - try: - # update end number if the request loader is finite and less than max - iter_length = len(self.request_loader) # type: ignore[arg-type] - if 
0 < iter_length < end_number: - end_number = iter_length - except InfiniteDatasetError: # noqa: BLE001, S110 - if scheduling_strategy.type_ == "constant" and max_duration is not None: - total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) - if total_requests_in_max_duration < end_number: - assert total_requests_in_max_duration > 0 - end_number = total_requests_in_max_duration - else: - # ToDo: Add poison - raise - except Exception: - pass + end_number = self._determine_total_requests_count(scheduling_strategy, max_duration, max_error_rate, max_number) if end_number == math.inf and max_error_rate is not None: - raise RuntimeError("Can't ensure max_error_rate since can't calculate total requests count") + logger.warning("max_error_rate will be ignored because end_number can not be determined.") if end_number == math.inf and end_time is None: logger.warning( @@ -319,6 +299,33 @@ def _run_setup( return info, requests_iter, times_iter + def _determine_total_requests_count( + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_error_rate: Optional[float], + max_number: Optional[int], + ) -> int: + end_number = max_number or math.inf + try: + # update end number if the request loader is finite and less than max + iter_length = len(self.request_loader) # type: ignore[arg-type] + if 0 < iter_length < end_number: + end_number = iter_length + except InfiniteDatasetError: # noqa: BLE001, S110 + if scheduling_strategy.type_ == "constant" and max_duration is not None: + total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) + if total_requests_in_max_duration < end_number: + assert total_requests_in_max_duration > 0 + end_number = total_requests_in_max_duration + else: + if max_error_rate: + logger.warning() + raise + except Exception: + pass + return end_number + def _add_requests( self, requests_iter: Optional[Iterator[Any]], From a17117c7dc3d973fd328d6754083dc9471db01b1 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 13:21:47 +0300 Subject: [PATCH 07/37] Add max error rate to readme, CLI & report --- README.md | 2 ++ src/guidellm/__main__.py | 12 ++++++++++++ src/guidellm/benchmark/aggregator.py | 3 +++ src/guidellm/benchmark/benchmark.py | 9 +++++++++ src/guidellm/benchmark/benchmarker.py | 9 +++++++++ src/guidellm/benchmark/entrypoints.py | 2 ++ 6 files changed, 37 insertions(+) diff --git a/README.md b/README.md index a46fd411..416d3cc1 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted. +- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only for finite deterministic scenarios i.e `rate_type` is `constant` and `--max-seconds` exists OR `--max-requests` exists OR the dataset is finite. If `--max-error-rate` is `None`, benchmarks will continue regardless of error rate. + - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results. - `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results. 
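+
+For example, a hypothetical constant-rate run that stops early once more than 10% of finished requests have errored might look like the following (the target, data, and rate values here are illustrative only, not defaults):
+
+```bash
+guidellm benchmark \
+  --target "http://localhost:8000" \
+  --data "prompt_tokens=256,output_tokens=128" \
+  --rate-type constant \
+  --rate 5 \
+  --max-seconds 120 \
+  --max-error-rate 0.1
+```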
diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index f38b11aa..baea9f13 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -163,6 +163,16 @@ def cli(): "If None, will run until max_seconds or the data is exhausted." ), ) +@click.option( + "--max-error-rate", + type=float, + help=( + "The maximum error rate after which a benchmark will stop. " + "Applicable only for finite deterministic scenarios i.e rate_type is 'constant' and 'max_seconds' exists OR " + "'max_requests' exists OR the dataset is finite. " + "If None, benchmarks will continue regardless of error rate." + ), +) @click.option( "--warmup-percent", type=float, @@ -242,6 +252,7 @@ def benchmark( rate, max_seconds, max_requests, + max_error_rate, warmup_percent, cooldown_percent, disable_progress, @@ -267,6 +278,7 @@ def benchmark( rate=rate, max_seconds=max_seconds, max_requests=max_requests, + max_error_rate=max_error_rate, warmup_percent=warmup_percent, cooldown_percent=cooldown_percent, show_progress=not disable_progress, diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index 9943f169..9fe80be8 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -599,6 +599,8 @@ def compile(self) -> GenerativeBenchmark: and return the compiled object. """ successful, incomplete, errored = self._compile_results() + error_rate = self.requests_stats.totals.errored.total / \ + (self.requests_stats.totals.successful + self.requests_stats.totals.errored.total) return GenerativeBenchmark.from_stats( run_id=self.run_id, @@ -625,6 +627,7 @@ def compile(self) -> GenerativeBenchmark: request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean, request_time_delay_avg=self.requests_stats.request_time_delay.mean, request_time_avg=self.requests_stats.request_time.mean, + error_rate=error_rate, ), worker=self.worker_description, requests_loader=self.request_loader_description, diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 50d2f49c..dee71fb7 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -90,6 +90,9 @@ class BenchmarkArgs(StandardBaseModel): max_duration: Optional[float] = Field( description="The maximum duration in seconds to run this benchmark, if any." ) + max_error_rate: Optional[float] = Field( + description="Maximum error rate after which a benchmark will stop." + ) warmup_number: Optional[int] = Field( description=( "The number of requests to run for the warmup phase of this benchmark, " @@ -213,6 +216,12 @@ class BenchmarkRunStats(StandardBaseModel): "it was completed." ) ) + error_rate: float = Field( + description=( + "The number of errored requests divided by the number of errored requests. This can be higher " + "than max_error_rate (if applicable) cause it does not take into account incomplete requests." 
+ ) + ) class BenchmarkMetrics(StandardBaseModel): diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index 11b6d245..7da25a3b 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -74,6 +74,11 @@ class BenchmarkerStrategyLimits(StandardBaseModel): description="Maximum duration (in seconds) to process requests per strategy.", ge=0, ) + max_error_rate: Optional[float] = Field( + description="Maximum error rate after which a sync benchmark will stop", + ge=0, + le=1, + ) warmup_percent_per_strategy: Optional[float] = Field( description="Percentage of requests to use for warmup.", ge=0, @@ -148,6 +153,7 @@ async def run( profile: Profile, max_number_per_strategy: Optional[int], max_duration_per_strategy: Optional[float], + max_error_rate: Optional[float], warmup_percent_per_strategy: Optional[float], cooldown_percent_per_strategy: Optional[float], ) -> AsyncGenerator[ @@ -162,6 +168,7 @@ async def run( requests_loader_size=requests_loader_size, max_number_per_strategy=max_number_per_strategy, max_duration_per_strategy=max_duration_per_strategy, + max_error_rate=max_error_rate, warmup_percent_per_strategy=warmup_percent_per_strategy, cooldown_percent_per_strategy=cooldown_percent_per_strategy, ) @@ -196,6 +203,7 @@ async def run( scheduling_strategy=scheduling_strategy, max_number=max_number_per_strategy, max_duration=max_duration_per_strategy, + max_error_rate=max_error_rate, ): if result.type_ == "run_start": yield BenchmarkerResult( @@ -321,6 +329,7 @@ def create_benchmark_aggregator( strategy=strategy, max_number=limits.max_number, max_duration=limits.max_duration, + max_error_rate=limits.max_error_rate, warmup_number=limits.warmup_number, warmup_duration=limits.warmup_duration, cooldown_number=limits.cooldown_number, diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index f252cf27..7e4af8c0 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -41,6 +41,7 @@ async def benchmark_generative_text( rate: Optional[Union[int, float, list[Union[int, float]]]], max_seconds: Optional[float], max_requests: Optional[int], + max_error_rate: Optional[float], warmup_percent: Optional[float], cooldown_percent: Optional[float], show_progress: bool, @@ -107,6 +108,7 @@ async def benchmark_generative_text( profile=profile, max_number_per_strategy=max_requests, max_duration_per_strategy=max_seconds, + max_error_rate=max_error_rate, warmup_percent_per_strategy=warmup_percent, cooldown_percent_per_strategy=cooldown_percent, ): From 34cb6b6cbd3a1c9efe72f7db8dd8578936dd92cd Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 13:43:34 +0300 Subject: [PATCH 08/37] make max_error_rate optional --- src/guidellm/scheduler/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index a340932d..4159f8f3 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -46,7 +46,7 @@ class SchedulerRunInfo(StandardBaseModel): end_number: float # ToDo: Rename to max_requests & change to int (check all references before) processes: int strategy: SchedulingStrategy - max_error_rate: float + max_error_rate: Optional[float] = None created_requests: int = 0 queued_requests: int = 0 From 6289c07e4e8aed4aa7bfbd6223ea401f2ca3993c Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 13:48:12 +0300 Subject: [PATCH 09/37] 
minor fixes

---
 src/guidellm/scheduler/scheduler.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 628a9ac7..07d4b2e1 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -277,7 +277,7 @@ def _run_setup(
         start_time = time.time()
         times_iter = iter(scheduling_strategy.request_times())
         end_time = time.time() + (max_duration or math.inf)
-        end_number = self._determine_total_requests_count(scheduling_strategy, max_duration, max_error_rate, max_number)
+        end_number = self._determine_total_requests_count(scheduling_strategy, max_duration, max_number)

         if end_number == math.inf and max_error_rate is not None:
             logger.warning("max_error_rate will be ignored because end_number can not be determined.")
@@ -303,7 +303,6 @@ def _determine_total_requests_count(
         self,
         scheduling_strategy: SchedulingStrategy,
         max_duration: Optional[float],
-        max_error_rate: Optional[float],
         max_number: Optional[int],
     ) -> int:
         end_number = max_number or math.inf
@@ -318,10 +317,6 @@ def _determine_total_requests_count(
                 if total_requests_in_max_duration < end_number:
                     assert total_requests_in_max_duration > 0
                     end_number = total_requests_in_max_duration
-            else:
-                if max_error_rate:
-                    logger.warning()
-                    raise
         except Exception:
             pass
         return end_number

From d5ee01822affd222141d2e6845b921d8f09e467f Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Wed, 21 May 2025 14:12:11 +0300
Subject: [PATCH 10/37] report error rate bugfix

---
 src/guidellm/benchmark/aggregator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py
index 9fe80be8..b66ae1f7 100644
--- a/src/guidellm/benchmark/aggregator.py
+++ b/src/guidellm/benchmark/aggregator.py
@@ -600,7 +600,7 @@ def compile(self) -> GenerativeBenchmark:
         """
         successful, incomplete, errored = self._compile_results()
         error_rate = self.requests_stats.totals.errored.total / \
-            (self.requests_stats.totals.successful + self.requests_stats.totals.errored.total)
+            (self.requests_stats.totals.successful.total + self.requests_stats.totals.errored.total)

         return GenerativeBenchmark.from_stats(
             run_id=self.run_id,

From ce13ef7294d448c0d03a32ef4a70699188617942 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Wed, 21 May 2025 14:12:24 +0300
Subject: [PATCH 11/37] add current error rate log

---
 src/guidellm/scheduler/scheduler.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 07d4b2e1..1edc4286 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -168,13 +168,17 @@ async def run(
                         run_info,
                     )
                     if iter_result is not None:
-                        if iter_result.request_info.errored \
-                                and not iter_result.request_info.canceled \
-                                and self._is_max_error_rate_reached(iter_result.run_info):
-                            shutdown_event.set()
-                            max_error_rate_reached = True
-                            logger.info(f"Max_error rate of ({iter_result.run_info.max_error_rate}) "
-                                        f"reached, sending shutdown signal")
+                        if iter_result.request_info.errored and not iter_result.request_info.canceled:
+                            current_error_rate = run_info.errored_requests / run_info.end_number
+                            is_over_max_error_rate = run_info.max_error_rate < current_error_rate
+
+                            if is_over_max_error_rate:
+                                shutdown_event.set()
+                                max_error_rate_reached = True
+                                logger.info(f"Max error rate of ({iter_result.run_info.max_error_rate}) 
" + f"reached, sending shutdown signal") + else: + logger.debug(f"Current error rate: {current_error_rate}") yield iter_result # yield control to the event loop @@ -415,11 +419,6 @@ def _check_result_ready( ) raise ValueError(f"Invalid process response type: {process_response}") - @staticmethod - def _is_max_error_rate_reached(run_info: SchedulerRunInfo) -> bool: - current_error_rate = run_info.errored_requests / run_info.end_number - return current_error_rate > run_info.max_error_rate - async def _stop_processes( self, futures: list[asyncio.Future], From 9a68a7687360048f62b5fb880a9bce95fe1313ea Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 14:19:03 +0300 Subject: [PATCH 12/37] remove todo --- src/guidellm/scheduler/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 4159f8f3..f899f54a 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -43,7 +43,7 @@ class SchedulerRunInfo(StandardBaseModel): start_time: float end_time: float - end_number: float # ToDo: Rename to max_requests & change to int (check all references before) + end_number: float processes: int strategy: SchedulingStrategy max_error_rate: Optional[float] = None From 6dd313de3b4275ec87fdb9c76685602d6a806e76 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 15:58:03 +0300 Subject: [PATCH 13/37] Fix tests --- src/guidellm/benchmark/output.py | 1 + src/guidellm/scheduler/scheduler.py | 6 ++++-- tests/unit/benchmark/test_output.py | 2 +- tests/unit/mock_benchmark.py | 2 ++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/guidellm/benchmark/output.py b/src/guidellm/benchmark/output.py index 4847160d..33b1efc2 100644 --- a/src/guidellm/benchmark/output.py +++ b/src/guidellm/benchmark/output.py @@ -419,6 +419,7 @@ def benchmarks_args_str(self) -> str: { "max_number": args.max_number, "max_duration": args.max_duration, + "max_error_rate": args.max_error_rate, "warmup_number": args.warmup_number, "warmup_duration": args.warmup_duration, "cooldown_number": args.cooldown_number, diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 1edc4286..3dd873d0 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -315,13 +315,15 @@ def _determine_total_requests_count( iter_length = len(self.request_loader) # type: ignore[arg-type] if 0 < iter_length < end_number: end_number = iter_length - except InfiniteDatasetError: # noqa: BLE001, S110 + except InfiniteDatasetError: + # Only when RPS is constant and duration is capped we can determine the total + # amount of requests that are supposed to be sent if scheduling_strategy.type_ == "constant" and max_duration is not None: total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) if total_requests_in_max_duration < end_number: assert total_requests_in_max_duration > 0 end_number = total_requests_in_max_duration - except Exception: + except Exception: # noqa: BLE001, S110 pass return end_number diff --git a/tests/unit/benchmark/test_output.py b/tests/unit/benchmark/test_output.py index 9076834b..e3114491 100644 --- a/tests/unit/benchmark/test_output.py +++ b/tests/unit/benchmark/test_output.py @@ -113,7 +113,7 @@ def test_console_benchmarks_args_str(): mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_args_str == ( - "max_number=None, max_duration=10.0, 
warmup_number=None, " + "max_number=None, max_duration=10.0, max_error_rate=0.05, warmup_number=None, " "warmup_duration=None, cooldown_number=None, cooldown_duration=None" ) diff --git a/tests/unit/mock_benchmark.py b/tests/unit/mock_benchmark.py index 81364fa1..3c360c68 100644 --- a/tests/unit/mock_benchmark.py +++ b/tests/unit/mock_benchmark.py @@ -221,6 +221,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: strategy=SynchronousStrategy(), max_number=None, max_duration=10.0, + max_error_rate=0.05, warmup_number=None, warmup_duration=None, cooldown_number=None, @@ -245,6 +246,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: request_start_time_targeted_delay_avg=1.2827096836907523, request_time_delay_avg=0.0004316908972603934, request_time_avg=1.426228676523481, + error_rate=0.345346, ), worker=GenerativeRequestsWorkerDescription( backend_type="openai_http", From 3697b308cd87c34e84370fdd6da04ef29c1a5ae9 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 16:14:49 +0300 Subject: [PATCH 14/37] Pre CR fixes --- README.md | 2 +- src/guidellm/__main__.py | 2 +- src/guidellm/benchmark/benchmark.py | 2 +- src/guidellm/scheduler/scheduler.py | 5 +++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 416d3cc1..0988c70e 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ The `guidellm benchmark` command is used to run benchmarks against a generative - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted. -- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only for finite deterministic scenarios i.e `rate_type` is `constant` and `--max-seconds` exists OR `--max-requests` exists OR the dataset is finite. If `--max-error-rate` is `None`, benchmarks will continue regardless of error rate. +- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only for finite deterministic scenarios i.e `rate_type` is `constant` and `--max-seconds` exists OR `--max-requests` exists OR the dataset is finite. If `--max-error-rate` is `None` or not applicable, benchmarks will continue regardless of error rate. - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results. diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index baea9f13..5628857b 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -170,7 +170,7 @@ def cli(): "The maximum error rate after which a benchmark will stop. " "Applicable only for finite deterministic scenarios i.e rate_type is 'constant' and 'max_seconds' exists OR " "'max_requests' exists OR the dataset is finite. " - "If None, benchmarks will continue regardless of error rate." + "If None or not applicable, benchmarks will continue regardless of error rate." 
), ) @click.option( diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index dee71fb7..dd391bfc 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -710,7 +710,7 @@ def from_stats( *["incomplete"] * len(incomplete), # type: ignore[list-item] *["error"] * len(errored), # type: ignore[list-item] ] - start_time = min(req.start_time for req in total) # ToDo: Fix if total is empty + start_time = min(req.start_time for req in total) end_time = max(req.end_time for req in total) total_with_prompt, total_types_with_prompt = ( diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 3dd873d0..c92bdc76 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -102,7 +102,8 @@ async def run( If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. :param max_error_rate: The maximum error rate after which the scheduler shuts down. - If not provided a default of 5% i.e 0.05 is used. + Only applicable in benchmarks with finite deterministic number of requests. + If None or not applicable then scheduler will continue regardless of errors. :return: An asynchronous generator that yields SchedulerResult objects. Each SchedulerResult object contains information about the request, the response, and the run information. @@ -130,7 +131,7 @@ async def run( manager, executor, scheduling_strategy, max_error_rate is not None ) if shutdown_event: - assert not shutdown_event.is_set() + assert not shutdown_event.is_set(), "shutdown_event is set before starting scheduling" run_info, requests_iter, times_iter = self._run_setup( futures, scheduling_strategy, max_number, max_duration, max_error_rate ) From 2fe64c7092265be8e9a2f6543fc7af9968930703 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 16:19:04 +0300 Subject: [PATCH 15/37] CR Fixes --- src/guidellm/benchmark/benchmarker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index 7da25a3b..ecb721f7 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -75,7 +75,7 @@ class BenchmarkerStrategyLimits(StandardBaseModel): ge=0, ) max_error_rate: Optional[float] = Field( - description="Maximum error rate after which a sync benchmark will stop", + description="Maximum error rate after which a benchmark will stop", ge=0, le=1, ) From b54ab14d668a8af007cf9382b29917ccee994764 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 18:14:40 +0300 Subject: [PATCH 16/37] Lint fixes --- src/guidellm/__main__.py | 3 +- src/guidellm/benchmark/aggregator.py | 10 ++- src/guidellm/benchmark/benchmark.py | 6 +- src/guidellm/request/__init__.py | 2 + src/guidellm/request/loader.py | 11 ++-- src/guidellm/scheduler/scheduler.py | 93 ++++++++++++++++------------ src/guidellm/scheduler/worker.py | 21 +++++-- 7 files changed, 93 insertions(+), 53 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 5628857b..8a1b9ff0 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -168,7 +168,8 @@ def cli(): type=float, help=( "The maximum error rate after which a benchmark will stop. 
" - "Applicable only for finite deterministic scenarios i.e rate_type is 'constant' and 'max_seconds' exists OR " + "Applicable only for finite deterministic scenarios i.e " + "rate_type is 'constant' and 'max_seconds' exists OR " "'max_requests' exists OR the dataset is finite. " "If None or not applicable, benchmarks will continue regardless of error rate." ), diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index b66ae1f7..73ae622a 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -599,8 +599,8 @@ def compile(self) -> GenerativeBenchmark: and return the compiled object. """ successful, incomplete, errored = self._compile_results() - error_rate = self.requests_stats.totals.errored.total / \ - (self.requests_stats.totals.successful.total + self.requests_stats.totals.errored.total) + + error_rate = self._calculate_error_rate() return GenerativeBenchmark.from_stats( run_id=self.run_id, @@ -634,6 +634,12 @@ def compile(self) -> GenerativeBenchmark: extras=self.extras, ) + def _calculate_error_rate(self) -> float: + total_successful = self.requests_stats.totals.successful.total + total_errored = self.requests_stats.totals.errored.total + total_sent = total_errored + total_successful + return total_errored / total_sent + def _compile_results( self, ) -> tuple[ diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index dd391bfc..40ffefba 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -218,8 +218,10 @@ class BenchmarkRunStats(StandardBaseModel): ) error_rate: float = Field( description=( - "The number of errored requests divided by the number of errored requests. This can be higher " - "than max_error_rate (if applicable) cause it does not take into account incomplete requests." + "The number of errored requests divided by the number " + "of errored requests. This can be higher than max_error_rate " + "(if applicable) cause it does not take into " + "account incomplete requests." 
) ) diff --git a/src/guidellm/request/__init__.py b/src/guidellm/request/__init__.py index db3059cc..606fb897 100644 --- a/src/guidellm/request/__init__.py +++ b/src/guidellm/request/__init__.py @@ -1,6 +1,7 @@ from .loader import ( GenerativeRequestLoader, GenerativeRequestLoaderDescription, + GetInfiniteDatasetLengthError, RequestLoader, RequestLoaderDescription, ) @@ -10,6 +11,7 @@ "GenerationRequest", "GenerativeRequestLoader", "GenerativeRequestLoaderDescription", + "GetInfiniteDatasetLengthError", "RequestLoader", "RequestLoaderDescription", ] diff --git a/src/guidellm/request/loader.py b/src/guidellm/request/loader.py index 0e54fc45..62bd17ea 100644 --- a/src/guidellm/request/loader.py +++ b/src/guidellm/request/loader.py @@ -19,13 +19,13 @@ __all__ = [ "GenerativeRequestLoader", "GenerativeRequestLoaderDescription", + "GetInfiniteDatasetLengthError", "RequestLoader", "RequestLoaderDescription", - "InfiniteDatasetError" ] -class InfiniteDatasetError(Exception): +class GetInfiniteDatasetLengthError(Exception): pass @@ -125,8 +125,11 @@ def __len__(self) -> int: if self.iter_type == "finite": return self.num_unique_items() - assert self.iter_type == "infinite" - raise InfiniteDatasetError(f"Dataset {self.data} is infinite and thus unable to determine length") + if self.iter_type != "infinite": + raise ValueError(f"Invalid iter_type {self.iter_type}") + raise GetInfiniteDatasetLengthError(f"Dataset {self.data} is " + f"infinite and thus " + f"unable to determine length") @property def description(self) -> GenerativeRequestLoaderDescription: diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index c92bdc76..6bdcbcfe 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -15,7 +15,7 @@ from loguru import logger from guidellm.config import settings -from guidellm.request.loader import InfiniteDatasetError +from guidellm.request.loader import GetInfiniteDatasetLengthError from guidellm.scheduler.result import ( SchedulerRequestResult, SchedulerResult, @@ -101,24 +101,15 @@ async def run( :param max_duration: The maximum duration for the scheduling run. If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. - :param max_error_rate: The maximum error rate after which the scheduler shuts down. + :param max_error_rate: The maximum error rate after which the + scheduler shuts down. Only applicable in benchmarks with finite deterministic number of requests. If None or not applicable then scheduler will continue regardless of errors. :return: An asynchronous generator that yields SchedulerResult objects. Each SchedulerResult object contains information about the request, the response, and the run information. 
""" - if scheduling_strategy is None or not isinstance( - scheduling_strategy, SchedulingStrategy - ): - raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") - - if max_number is not None and max_number < 1: - raise ValueError(f"Invalid max_number: {max_number}") - if max_duration is not None and max_duration < 0: - raise ValueError(f"Invalid max_duration: {max_duration}") - if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): - raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + self._validate_scheduler_params(scheduling_strategy, max_duration, max_error_rate, max_number) with ( multiprocessing.Manager() as manager, @@ -127,11 +118,13 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue, shutdown_event = await self._start_processes( - manager, executor, scheduling_strategy, max_error_rate is not None - ) - if shutdown_event: - assert not shutdown_event.is_set(), "shutdown_event is set before starting scheduling" + futures, requests_queue, responses_queue, shutdown_event = \ + await self._start_processes( + manager, executor, scheduling_strategy, max_error_rate is not None) + if shutdown_event and shutdown_event.is_set(): + raise RuntimeError( + "shutdown_event is set before starting scheduling" + ) run_info, requests_iter, times_iter = self._run_setup( futures, scheduling_strategy, max_number, max_duration, max_error_rate ) @@ -169,17 +162,14 @@ async def run( run_info, ) if iter_result is not None: - if iter_result.request_info.errored and not iter_result.request_info.canceled: - current_error_rate = run_info.errored_requests / run_info.end_number - is_over_max_error_rate = run_info.max_error_rate < current_error_rate - - if is_over_max_error_rate: - shutdown_event.set() - max_error_rate_reached = True - logger.info(f"Max error rate of ({iter_result.run_info.max_error_rate}) " - f"reached, sending shutdown signal") - else: - logger.debug(f"Current error rate: {current_error_rate}") + if iter_result.request_info.errored \ + and not iter_result.request_info.canceled \ + and self._is_max_error_rate_reached(iter_result.run_info): + shutdown_event.set() + max_error_rate_reached = True + logger.info(f"Max error rate of " + f"({iter_result.run_info.max_error_rate}) " + f"reached, sending shutdown signal") yield iter_result # yield control to the event loop @@ -194,6 +184,28 @@ async def run( await self._stop_processes(futures, requests_queue) + def _validate_scheduler_params( + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_error_rate: Optional[float], + max_number: Optional[int] + ) -> None: + if scheduling_strategy is None or not isinstance( + scheduling_strategy, SchedulingStrategy + ): + raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") + if max_number is not None and max_number < 1: + raise ValueError(f"Invalid max_number: {max_number}") + if max_duration is not None and max_duration < 0: + raise ValueError(f"Invalid max_duration: {max_duration}") + if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): + raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + + def _is_max_error_rate_reached(self, run_info) -> bool: + current_error_rate = run_info.errored_requests / run_info.end_number + return run_info.max_error_rate < current_error_rate + async def _start_processes( self, manager, @@ -282,10 +294,13 @@ def _run_setup( start_time = time.time() times_iter = 
iter(scheduling_strategy.request_times()) end_time = time.time() + (max_duration or math.inf) - end_number = self._determine_total_requests_count(scheduling_strategy, max_duration, max_number) + end_number = self._determine_total_requests_count( + scheduling_strategy, max_duration, max_number + ) if end_number == math.inf and max_error_rate is not None: - logger.warning("max_error_rate will be ignored because end_number can not be determined.") + logger.warning("max_error_rate will be ignored " + "because end_number can not be determined.") if end_number == math.inf and end_time is None: logger.warning( @@ -312,17 +327,19 @@ def _determine_total_requests_count( ) -> int: end_number = max_number or math.inf try: - # update end number if the request loader is finite and less than max + # update end_number if the request_loader is finite and less than max_number iter_length = len(self.request_loader) # type: ignore[arg-type] if 0 < iter_length < end_number: end_number = iter_length - except InfiniteDatasetError: - # Only when RPS is constant and duration is capped we can determine the total - # amount of requests that are supposed to be sent + except GetInfiniteDatasetLengthError: + # Only when RPS is constant and duration is + # capped we can determine the total amount of requests + # that are supposed to be sent if scheduling_strategy.type_ == "constant" and max_duration is not None: - total_requests_in_max_duration = int(scheduling_strategy.rate * max_duration) - if total_requests_in_max_duration < end_number: - assert total_requests_in_max_duration > 0 + total_requests_in_max_duration = int( + scheduling_strategy.rate * max_duration + ) + if 0 < total_requests_in_max_duration < end_number: end_number = total_requests_in_max_duration except Exception: # noqa: BLE001, S110 pass diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 4515fefa..800207a0 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -128,18 +128,26 @@ async def get_request( process_id: Optional[int] = None, ) -> Optional[WorkerProcessRequest[RequestT]]: if shutdown_event is not None and process_id is None: - logger.warning("shutdown_event is not None and process_id is None which makes it hard to debug") + logger.warning("shutdown_event is not None and process_id " + "is None which makes it hard to debug") def _get_queue_intermittently(): - assert shutdown_event is not None + if shutdown_event is None: + raise ValueError("Shouldn't use _get_queue_intermittently " + "if there's no shutdown_even") while True: try: - return requests_queue.get(timeout=timedelta(seconds=1).total_seconds()) + get_timeout = timedelta(seconds=1).total_seconds() + return requests_queue.get(timeout=get_timeout) except queue.Empty: if shutdown_event.is_set(): logger.info(f"Shutdown signal received in future {process_id}") - return - return await asyncio.to_thread(_get_queue_intermittently if shutdown_event is not None else requests_queue.get) # type: ignore[attr-defined] + return None + + get_method = _get_queue_intermittently \ + if shutdown_event is not None \ + else requests_queue.get + return await asyncio.to_thread(get_method) # type: ignore[attr-defined] async def send_result( self, @@ -165,7 +173,8 @@ async def resolve_scheduler_request( scheduled_time=time.time(), process_id=process_id, ) - request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult( + request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = \ + WorkerProcessResult( 
type_="request_scheduled", request=request, response=None, From b502c9488cd497831a821f91291a42eecfe01c33 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 21 May 2025 18:16:22 +0300 Subject: [PATCH 17/37] Lint fixes --- src/guidellm/scheduler/scheduler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 6bdcbcfe..db505181 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -109,7 +109,10 @@ async def run( Each SchedulerResult object contains information about the request, the response, and the run information. """ - self._validate_scheduler_params(scheduling_strategy, max_duration, max_error_rate, max_number) + self._validate_scheduler_params(scheduling_strategy, + max_duration, + max_error_rate, + max_number) with ( multiprocessing.Manager() as manager, @@ -163,8 +166,8 @@ async def run( ) if iter_result is not None: if iter_result.request_info.errored \ - and not iter_result.request_info.canceled \ - and self._is_max_error_rate_reached(iter_result.run_info): + and not iter_result.request_info.canceled \ + and self._is_max_error_rate_reached(iter_result.run_info): shutdown_event.set() max_error_rate_reached = True logger.info(f"Max error rate of " From 332ef08a5084c3846a38d444c815196cf3190266 Mon Sep 17 00:00:00 2001 From: markvaykhansky Date: Wed, 21 May 2025 19:14:21 +0300 Subject: [PATCH 18/37] better var name --- src/guidellm/benchmark/aggregator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index 73ae622a..cd725326 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -637,8 +637,8 @@ def compile(self) -> GenerativeBenchmark: def _calculate_error_rate(self) -> float: total_successful = self.requests_stats.totals.successful.total total_errored = self.requests_stats.totals.errored.total - total_sent = total_errored + total_successful - return total_errored / total_sent + total_finished = total_errored + total_successful + return total_errored / total_finished def _compile_results( self, From c2fd813233fe0cdd253796205464e6e6167deeff Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 08:33:33 +0300 Subject: [PATCH 19/37] Type fixes, typos & bugfixes --- src/guidellm/__main__.py | 4 ++-- src/guidellm/scheduler/scheduler.py | 16 ++++++++++++---- src/guidellm/scheduler/worker.py | 15 ++++++++------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 8a1b9ff0..bfa566b2 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -179,7 +179,7 @@ def cli(): type=float, default=None, help=( - "The percent of the benchmark (based on max-seconds, max-requets, " + "The percent of the benchmark (based on max-seconds, max-requests, " "or lenth of dataset) to run as a warmup and not include in the final results. " "Defaults to None." ), @@ -188,7 +188,7 @@ def cli(): "--cooldown-percent", type=float, help=( - "The percent of the benchmark (based on max-seconds, max-requets, or lenth " + "The percent of the benchmark (based on max-seconds, max-requests, or length " "of dataset) to run as a cooldown and not include in the final results. " "Defaults to None." 
), diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index db505181..ceffecd3 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -5,6 +5,7 @@ import time from collections.abc import AsyncGenerator, Iterable, Iterator from concurrent.futures import ProcessPoolExecutor +from multiprocessing.synchronize import Event as MultiprocessingEvent from typing import ( Any, Generic, @@ -168,11 +169,15 @@ async def run( if iter_result.request_info.errored \ and not iter_result.request_info.canceled \ and self._is_max_error_rate_reached(iter_result.run_info): + if shutdown_event is None: + raise RuntimeError("We've reached max_error_rate " + "but shutdown_event is corrupt") shutdown_event.set() max_error_rate_reached = True logger.info(f"Max error rate of " f"({iter_result.run_info.max_error_rate}) " f"reached, sending shutdown signal") + logger.info("Itter is not None") yield iter_result # yield control to the event loop @@ -205,8 +210,12 @@ def _validate_scheduler_params( if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): raise ValueError(f"Invalid max_error_rate: {max_error_rate}") - def _is_max_error_rate_reached(self, run_info) -> bool: + def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: + if run_info.max_error_rate is None: + return False current_error_rate = run_info.errored_requests / run_info.end_number + logger.info(f"Current error rate {current_error_rate} " + f"i.e total_finished [success / error] / max total possible") return run_info.max_error_rate < current_error_rate async def _start_processes( @@ -219,7 +228,7 @@ async def _start_processes( list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, - Optional[multiprocessing.Event] + Optional[MultiprocessingEvent] ]: await self.worker.prepare_multiprocessing() shutdown_event = manager.Event() if create_shutdown_event else None @@ -232,7 +241,6 @@ async def _start_processes( scheduling_strategy.processes_limit, scheduling_strategy.processing_requests_limit, ) - num_processes = 1 requests_limit_split = ( scheduling_strategy.processing_requests_limit // scheduling_strategy.processes_limit @@ -327,7 +335,7 @@ def _determine_total_requests_count( scheduling_strategy: SchedulingStrategy, max_duration: Optional[float], max_number: Optional[int], - ) -> int: + ) -> Union[int, float]: end_number = max_number or math.inf try: # update end_number if the request_loader is finite and less than max_number diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 800207a0..f4072c5d 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -1,6 +1,5 @@ import asyncio import math -import multiprocessing import multiprocessing.queues import queue import time @@ -8,6 +7,7 @@ from collections.abc import AsyncGenerator from dataclasses import dataclass from datetime import timedelta +from multiprocessing.synchronize import Event as MultiprocessingEvent from typing import ( Any, Generic, @@ -124,7 +124,7 @@ async def resolve( async def get_request( self, requests_queue: multiprocessing.Queue, - shutdown_event: Optional[multiprocessing.Event] = None, + shutdown_event: Optional[MultiprocessingEvent] = None, process_id: Optional[int] = None, ) -> Optional[WorkerProcessRequest[RequestT]]: if shutdown_event is not None and process_id is None: @@ -186,7 +186,8 @@ async def resolve_scheduler_request( await asyncio.sleep(wait_time) info.worker_start = time.time() - 
request_start_result = WorkerProcessResult( + request_start_result: WorkerProcessResult[RequestT, ResponseT] = \ + WorkerProcessResult( type_="request_start", request=request, response=None, @@ -215,7 +216,7 @@ def process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, - shutdown_event: Optional[multiprocessing.Event] = None, + shutdown_event: Optional[MultiprocessingEvent] = None, ): async def _process_runner(): while ( @@ -256,7 +257,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: Optional[multiprocessing.Event] = None, + shutdown_event: Optional[MultiprocessingEvent] = None, ): async def _process_runner(): pending = asyncio.Semaphore(max_concurrency) @@ -355,7 +356,7 @@ def process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, - shutdown_event: Optional[multiprocessing.Event] = None + shutdown_event: Optional[MultiprocessingEvent] = None ): asyncio.run(self.backend.validate()) super().process_loop_synchronous( @@ -371,7 +372,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: Optional[multiprocessing.Event] = None + shutdown_event: Optional[MultiprocessingEvent] = None ): asyncio.run(self.backend.validate()) super().process_loop_asynchronous( From 4bda8cf20c118ca3ecf0dc6b3d11813a0556e5db Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 10:11:42 +0300 Subject: [PATCH 20/37] Remove spammy log + bugfix --- src/guidellm/scheduler/scheduler.py | 1 - src/guidellm/scheduler/worker.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index ceffecd3..4097cfed 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -177,7 +177,6 @@ async def run( logger.info(f"Max error rate of " f"({iter_result.run_info.max_error_rate}) " f"reached, sending shutdown signal") - logger.info("Itter is not None") yield iter_result # yield control to the event loop diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index f4072c5d..bc77a11b 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -281,6 +281,7 @@ def _task_done(_: asyncio.Task): if shutdown_event and shutdown_event.is_set(): logger.info(f"Shutdown signal received in future {process_id}") + pending.release() break task = asyncio.create_task( self.resolve_scheduler_request( From 26319a5c89fba8105709a46811fd95d5b5f1f33d Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 15:39:57 +0300 Subject: [PATCH 21/37] Sleep intermittently --- src/guidellm/scheduler/worker.py | 47 +++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/worker.py index bc77a11b..41b4423d 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -165,6 +165,7 @@ async def resolve_scheduler_request( timeout_time: float, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: Optional[MultiprocessingEvent] = None, ): info = SchedulerRequestInfo( targeted_start_time=start_time, @@ -183,7 +184,21 @@ async def resolve_scheduler_request( asyncio.create_task(self.send_result(results_queue, request_scheduled_result)) if (wait_time :=
start_time - time.time()) > 0: - await asyncio.sleep(wait_time) + if shutdown_event is None: + await asyncio.sleep(wait_time) + else: + shutdown_signal_received = \ + await self._sleep_intermittently_until_timestamp_or_shutdown( + sleep_until_timestamp=start_time, + shutdown_event=shutdown_event, + ) + if shutdown_signal_received: + logger.info( + "Received shutdown signal " + "while waiting to start " + f"|| Process ID {process_id}" + ) + return info.worker_start = time.time() request_start_result: WorkerProcessResult[RequestT, ResponseT] = \ @@ -211,6 +226,18 @@ async def resolve_scheduler_request( ) asyncio.create_task(self.send_result(results_queue, result)) + async def _sleep_intermittently_until_timestamp_or_shutdown( + self, + sleep_until_timestamp: float, + shutdown_event: MultiprocessingEvent, + ) -> bool: + delta = timedelta(seconds=10).total_seconds() + while time.time() < sleep_until_timestamp: + await asyncio.sleep(delta) + if shutdown_event.is_set(): + return True + return False + def process_loop_synchronous( self, requests_queue: multiprocessing.Queue, @@ -240,6 +267,7 @@ async def _process_runner(): timeout_time=process_request.timeout_time, results_queue=results_queue, process_id=process_id, + shutdown_event=shutdown_event, ) try: @@ -271,10 +299,26 @@ async def _process_runner(): shutdown_event=shutdown_event, process_id=process_id) ) is not None: + if shutdown_event and shutdown_event.is_set(): + logger.error("This shouldn't happen! " + "We should catch the " + "shutdown in the get wrapper") + logger.info(f"Shutdown signal received" + f" in future {process_id}") + break + dequeued_time = time.time() + logger.debug(f"Dequeued Process ID {process_id} || " + f"Timestamp {dequeued_time} || " + f"Semaphore {pending._value}/{max_concurrency}") await pending.acquire() + lock_acquired_at = time.time() + logger.debug(f"Lock acquired Process ID {process_id} ||" + f" Timestamp {lock_acquired_at} ||" + f" Semaphore {pending._value}/{max_concurrency}") + def _task_done(_: asyncio.Task): nonlocal pending pending.release() @@ -292,6 +336,7 @@ def _task_done(_: asyncio.Task): timeout_time=process_request.timeout_time, results_queue=results_queue, process_id=process_id, + shutdown_event=shutdown_event, ) ) task.add_done_callback(_task_done) From 09925a40c9d3ef1fc4e6ba1d23ed88442fd173ed Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 15:43:51 +0300 Subject: [PATCH 22/37] Add missing error log --- src/guidellm/scheduler/worker.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 41b4423d..6883f739 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -254,6 +254,9 @@ async def _process_runner(): ) ) is not None: if shutdown_event and shutdown_event.is_set(): + logger.error("This shouldn't happen! 
" + "We should catch the " + "shutdown in the get wrapper") logger.info(f"Shutdown signal received in future {process_id}") break From fa562587b0bfdcf2441eb6ef05a88e21c1b97bd0 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 22 May 2025 16:22:12 +0300 Subject: [PATCH 23/37] linting fixes --- src/guidellm/scheduler/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 6883f739..f37b7708 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -313,14 +313,14 @@ async def _process_runner(): dequeued_time = time.time() logger.debug(f"Dequeued Process ID {process_id} || " f"Timestamp {dequeued_time} || " - f"Semaphore {pending._value}/{max_concurrency}") + f"Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 await pending.acquire() lock_acquired_at = time.time() logger.debug(f"Lock acquired Process ID {process_id} ||" f" Timestamp {lock_acquired_at} ||" - f" Semaphore {pending._value}/{max_concurrency}") + f" Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 def _task_done(_: asyncio.Task): nonlocal pending From 3361d2f28e41229696e2586894f7bfa4ad16bf19 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Sun, 1 Jun 2025 12:55:31 +0300 Subject: [PATCH 24/37] WIP CR Fixes --- src/guidellm/benchmark/benchmark.py | 3 +- src/guidellm/request/loader.py | 6 +- src/guidellm/scheduler/scheduler.py | 82 ++++---- src/guidellm/scheduler/worker.py | 288 +++++++++++++--------------- 4 files changed, 188 insertions(+), 191 deletions(-) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 40ffefba..d33e6a56 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -219,7 +219,8 @@ class BenchmarkRunStats(StandardBaseModel): error_rate: float = Field( description=( "The number of errored requests divided by the number " - "of errored requests. This can be higher than max_error_rate " + "of successful and errored requests. " + "This can be higher than max_error_rate " "(if applicable) cause it does not take into " "account incomplete requests." ) diff --git a/src/guidellm/request/loader.py b/src/guidellm/request/loader.py index 62bd17ea..26a06eb7 100644 --- a/src/guidellm/request/loader.py +++ b/src/guidellm/request/loader.py @@ -127,9 +127,9 @@ def __len__(self) -> int: if self.iter_type != "infinite": raise ValueError(f"Invalid iter_type {self.iter_type}") - raise GetInfiniteDatasetLengthError(f"Dataset {self.data} is " - f"infinite and thus " - f"unable to determine length") + raise GetInfiniteDatasetLengthError( + f"Dataset {self.data} is infinite and thus unable to determine length" + ) @property def description(self) -> GenerativeRequestLoaderDescription: diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 4097cfed..102ebd69 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -110,10 +110,9 @@ async def run( Each SchedulerResult object contains information about the request, the response, and the run information. 
""" - self._validate_scheduler_params(scheduling_strategy, - max_duration, - max_error_rate, - max_number) + self._validate_scheduler_params( + scheduling_strategy, max_duration, max_error_rate, max_number + ) with ( multiprocessing.Manager() as manager, @@ -122,13 +121,16 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue, shutdown_event = \ - await self._start_processes( - manager, executor, scheduling_strategy, max_error_rate is not None) + ( + futures, + requests_queue, + responses_queue, + shutdown_event, + ) = await self._start_processes( + manager, executor, scheduling_strategy, max_error_rate is not None + ) if shutdown_event and shutdown_event.is_set(): - raise RuntimeError( - "shutdown_event is set before starting scheduling" - ) + raise RuntimeError("shutdown_event is set before starting scheduling") run_info, requests_iter, times_iter = self._run_setup( futures, scheduling_strategy, max_number, max_duration, max_error_rate ) @@ -166,17 +168,23 @@ async def run( run_info, ) if iter_result is not None: - if iter_result.request_info.errored \ - and not iter_result.request_info.canceled \ - and self._is_max_error_rate_reached(iter_result.run_info): + if ( + iter_result.request_info.errored + and not iter_result.request_info.canceled + and self._is_max_error_rate_reached(iter_result.run_info) + ): if shutdown_event is None: - raise RuntimeError("We've reached max_error_rate " - "but shutdown_event is corrupt") + raise RuntimeError( + "We've reached max_error_rate " + "but shutdown_event is corrupt" + ) shutdown_event.set() max_error_rate_reached = True - logger.info(f"Max error rate of " - f"({iter_result.run_info.max_error_rate}) " - f"reached, sending shutdown signal") + logger.info( + f"Max error rate of " + f"({iter_result.run_info.max_error_rate}) " + f"reached, sending shutdown signal" + ) yield iter_result # yield control to the event loop @@ -192,14 +200,14 @@ async def run( await self._stop_processes(futures, requests_queue) def _validate_scheduler_params( - self, - scheduling_strategy: SchedulingStrategy, - max_duration: Optional[float], - max_error_rate: Optional[float], - max_number: Optional[int] + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_error_rate: Optional[float], + max_number: Optional[int], ) -> None: if scheduling_strategy is None or not isinstance( - scheduling_strategy, SchedulingStrategy + scheduling_strategy, SchedulingStrategy ): raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") if max_number is not None and max_number < 1: @@ -213,8 +221,10 @@ def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: if run_info.max_error_rate is None: return False current_error_rate = run_info.errored_requests / run_info.end_number - logger.info(f"Current error rate {current_error_rate} " - f"i.e total_finished [success / error] / max total possible") + logger.info( + f"Current error rate {current_error_rate} " + f"i.e total_finished [success / error] / max total possible" + ) return run_info.max_error_rate < current_error_rate async def _start_processes( @@ -222,12 +232,12 @@ async def _start_processes( manager, executor: ProcessPoolExecutor, scheduling_strategy: SchedulingStrategy, - create_shutdown_event: bool = False + create_shutdown_event: bool = False, ) -> tuple[ list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, - Optional[MultiprocessingEvent] + Optional[MultiprocessingEvent], ]: await 
self.worker.prepare_multiprocessing() shutdown_event = manager.Event() if create_shutdown_event else None @@ -309,8 +319,10 @@ def _run_setup( ) if end_number == math.inf and max_error_rate is not None: - logger.warning("max_error_rate will be ignored " - "because end_number can not be determined.") + logger.warning( + "max_error_rate will be ignored " + "because end_number can not be determined." + ) if end_number == math.inf and end_time is None: logger.warning( @@ -324,16 +336,16 @@ def _run_setup( end_number=end_number, processes=len(processes), strategy=scheduling_strategy, - max_error_rate=max_error_rate + max_error_rate=max_error_rate, ) return info, requests_iter, times_iter def _determine_total_requests_count( - self, - scheduling_strategy: SchedulingStrategy, - max_duration: Optional[float], - max_number: Optional[int], + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_number: Optional[int], ) -> Union[int, float]: end_number = max_number or math.inf try: diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index f37b7708..f80afb33 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -123,31 +123,10 @@ async def resolve( ... async def get_request( - self, requests_queue: multiprocessing.Queue, - shutdown_event: Optional[MultiprocessingEvent] = None, - process_id: Optional[int] = None, + self, + requests_queue: multiprocessing.Queue, ) -> Optional[WorkerProcessRequest[RequestT]]: - if shutdown_event is not None and process_id is None: - logger.warning("shutdown_event is not None and process_id " - "is None which makes it hard to debug") - - def _get_queue_intermittently(): - if shutdown_event is None: - raise ValueError("Shouldn't use _get_queue_intermittently " - "if there's no shutdown_even") - while True: - try: - get_timeout = timedelta(seconds=1).total_seconds() - return requests_queue.get(timeout=get_timeout) - except queue.Empty: - if shutdown_event.is_set(): - logger.info(f"Shutdown signal received in future {process_id}") - return None - - get_method = _get_queue_intermittently \ - if shutdown_event is not None \ - else requests_queue.get - return await asyncio.to_thread(get_method) # type: ignore[attr-defined] + return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] async def send_result( self, @@ -165,7 +144,6 @@ async def resolve_scheduler_request( timeout_time: float, results_queue: multiprocessing.Queue, process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None, ): info = SchedulerRequestInfo( targeted_start_time=start_time, @@ -174,39 +152,27 @@ async def resolve_scheduler_request( scheduled_time=time.time(), process_id=process_id, ) - request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = \ + request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = ( WorkerProcessResult( - type_="request_scheduled", - request=request, - response=None, - info=info, + type_="request_scheduled", + request=request, + response=None, + info=info, + ) ) asyncio.create_task(self.send_result(results_queue, request_scheduled_result)) if (wait_time := start_time - time.time()) > 0: - if shutdown_event is None: - await asyncio.sleep(wait_time) - else: - shutdown_signal_received = \ - await self._sleep_intermittently_until_timestamp_or_shutdown( - sleep_until_timestamp=start_time, - shutdown_event=shutdown_event, - ) - if shutdown_signal_received: - logger.info( - "Received shutdown signal " - "while waiting to start " - f"|| 
Process ID {process_id}" - ) - return + await asyncio.sleep(wait_time) info.worker_start = time.time() - request_start_result: WorkerProcessResult[RequestT, ResponseT] = \ + request_start_result: WorkerProcessResult[RequestT, ResponseT] = ( WorkerProcessResult( - type_="request_start", - request=request, - response=None, - info=info, + type_="request_start", + request=request, + response=None, + info=info, + ) ) asyncio.create_task(self.send_result(results_queue, request_start_result)) @@ -226,53 +192,57 @@ async def resolve_scheduler_request( ) asyncio.create_task(self.send_result(results_queue, result)) - async def _sleep_intermittently_until_timestamp_or_shutdown( + def run_process( self, - sleep_until_timestamp: float, - shutdown_event: MultiprocessingEvent, - ) -> bool: - delta = timedelta(seconds=10).total_seconds() - while time.time() < sleep_until_timestamp: - await asyncio.sleep(delta) - if shutdown_event.is_set(): - return True - return False - - def process_loop_synchronous( - self, - requests_queue: multiprocessing.Queue, - results_queue: multiprocessing.Queue, - process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None, + type_: Literal["synchronous", "asynchronous"], + requests_queue: multiprocessing.Queue, + results_queue: multiprocessing.Queue, + shutdown_event: multiprocessing.Event, + shutdown_poll_interval: float, + process_id: int, + max_concurrency: int, ): async def _process_runner(): - while ( - process_request := await self.get_request( + if type_ == "synchronous": + loop_task = asyncio.create_task(self._process_synchronous_requests_loop( requests_queue=requests_queue, - shutdown_event=shutdown_event, + results_queue=results_queue, process_id=process_id, - ) - ) is not None: - if shutdown_event and shutdown_event.is_set(): - logger.error("This shouldn't happen! 
" - "We should catch the " - "shutdown in the get wrapper") - logger.info(f"Shutdown signal received in future {process_id}") - break - - dequeued_time = time.time() - - await self.resolve_scheduler_request( - request=process_request.request, - queued_time=process_request.queued_time, - dequeued_time=dequeued_time, - start_time=process_request.start_time, - timeout_time=process_request.timeout_time, + ), name="request_loop_processor_task") + elif type_ == "asynchronous": + loop_task = asyncio.create_task(self._process_asynchronous_requests_loop( + requests_queue=requests_queue, results_queue=results_queue, + max_concurrency=max_concurrency, process_id=process_id, - shutdown_event=shutdown_event, - ) + ), name="request_loop_processor_task") + else: + raise ValueError(f"Invalid process type: {type_}") + + shutdown_task = asyncio.create_task( + self._wait_for_shutdown(shutdown_event, shutdown_poll_interval), + name="shutdown_task" + ) + done, pending = await asyncio.wait( + [ + loop_task, + shutdown_task, + ], + return_when=asyncio.FIRST_EXCEPTION, + ) + + for task in pending: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + for task in done: + task_exception = task.exception() + if not isinstance(task_exception, asyncio.CancelledError): + raise task_exception try: asyncio.run(_process_runner()) except Exception as exc: # noqa: BLE001 @@ -281,78 +251,92 @@ async def _process_runner(): exc_info=True, stack_info=True, ) + finally: + shutdown_event.set() # ensure shutdown event is set to stop other processes - def process_loop_asynchronous( + async def _wait_for_shutdown( + self, + shutdown_event: MultiprocessingEvent, + shutdown_poll_interval: float, + ): + while not shutdown_event.is_set(): + await asyncio.sleep(shutdown_poll_interval) + + raise asyncio.CancelledError("Shutdown event set, cancelling process loop.") + + async def _process_synchronous_requests_loop( + self, + requests_queue: multiprocessing.Queue, + results_queue: multiprocessing.Queue, + process_id: int, + ): + while True: + process_request = await self.get_request( + requests_queue=requests_queue, + ) + + dequeued_time = time.time() + + await self.resolve_scheduler_request( + request=process_request.request, + queued_time=process_request.queued_time, + dequeued_time=dequeued_time, + start_time=process_request.start_time, + timeout_time=process_request.timeout_time, + results_queue=results_queue, + process_id=process_id, + ) + + async def _process_asynchronous_requests_loop( self, requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None, ): - async def _process_runner(): - pending = asyncio.Semaphore(max_concurrency) + pending = asyncio.Semaphore(max_concurrency) - if pending.locked(): - raise ValueError("Async worker called with max_concurrency < 1") + if pending.locked(): + raise ValueError("Async worker called with max_concurrency < 1") - while ( - process_request := await self.get_request( - requests_queue=requests_queue, - shutdown_event=shutdown_event, - process_id=process_id) - ) is not None: - if shutdown_event and shutdown_event.is_set(): - logger.error("This shouldn't happen! 
" - "We should catch the " - "shutdown in the get wrapper") - logger.info(f"Shutdown signal received" - f" in future {process_id}") - break - - dequeued_time = time.time() - logger.debug(f"Dequeued Process ID {process_id} || " - f"Timestamp {dequeued_time} || " - f"Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 - - await pending.acquire() - - lock_acquired_at = time.time() - logger.debug(f"Lock acquired Process ID {process_id} ||" - f" Timestamp {lock_acquired_at} ||" - f" Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 - - def _task_done(_: asyncio.Task): - nonlocal pending - pending.release() - - if shutdown_event and shutdown_event.is_set(): - logger.info(f"Shutdown signal received in future {process_id}") - pending.release() - break - task = asyncio.create_task( - self.resolve_scheduler_request( - request=process_request.request, - queued_time=process_request.queued_time, - dequeued_time=dequeued_time, - start_time=process_request.start_time, - timeout_time=process_request.timeout_time, - results_queue=results_queue, - process_id=process_id, - shutdown_event=shutdown_event, - ) - ) - task.add_done_callback(_task_done) - await asyncio.sleep(0) # enable start task immediately + while True: + process_request = await self.get_request( + requests_queue=requests_queue, + ) - try: - asyncio.run(_process_runner()) - except Exception as exc: # noqa: BLE001 - logger.error( - f"Error in worker process {process_id}: {exc}", - exc_info=True, - stack_info=True, + dequeued_time = time.time() + logger.debug( + f"Dequeued Process ID {process_id} || " + f"Timestamp {dequeued_time} || " + f"Semaphore {pending._value}/{max_concurrency}" # noqa: SLF001 + ) + + await pending.acquire() + + lock_acquired_at = time.time() + logger.debug( + f"Lock acquired Process ID {process_id} ||" + f" Timestamp {lock_acquired_at} ||" + f" Semaphore {pending._value}/{max_concurrency}" # noqa: SLF001 + ) + + def _task_done(_: asyncio.Task): + nonlocal pending + pending.release() + + task = asyncio.create_task( + self.resolve_scheduler_request( + request=process_request.request, + queued_time=process_request.queued_time, + dequeued_time=dequeued_time, + start_time=process_request.start_time, + timeout_time=process_request.timeout_time, + results_queue=results_queue, + process_id=process_id, + ) ) + task.add_done_callback(_task_done) + await asyncio.sleep(0) # enable start task immediately class GenerativeRequestsWorkerDescription(WorkerDescription): @@ -405,7 +389,7 @@ def process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None + shutdown_event: Optional[MultiprocessingEvent] = None, ): asyncio.run(self.backend.validate()) super().process_loop_synchronous( @@ -421,7 +405,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, - shutdown_event: Optional[MultiprocessingEvent] = None + shutdown_event: Optional[MultiprocessingEvent] = None, ): asyncio.run(self.backend.validate()) super().process_loop_asynchronous( From c134f66045167b64e9f36b074ebc5e894ea55519 Mon Sep 17 00:00:00 2001 From: markvaykhansky Date: Sun, 1 Jun 2025 15:12:20 +0300 Subject: [PATCH 25/37] WIP --- .pre-commit-config.yaml | 74 ++++++++++++++--------------- src/guidellm/scheduler/scheduler.py | 19 ++------ src/guidellm/scheduler/worker.py | 17 ++++--- 3 files changed, 52 insertions(+), 58 deletions(-) diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index e60e2899..9b9df3fd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,40 +4,40 @@ repos: hooks: - id: trailing-whitespace - id: end-of-file-fixer -- repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.7 - hooks: - - id: ruff -- repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.15.0 - hooks: - - id: mypy - args: [--check-untyped-defs] - additional_dependencies: - [ - # main dependencies - click, - datasets, - ftfy, - loguru, - numpy, - pillow, - pydantic, - pydantic_settings, - pyyaml, - respx, - rich, - setuptools, - setuptools-git-versioning, - transformers, - - # dev dependencies - pytest, - pydantic_settings, - - # types - types-click, - types-PyYAML, - types-requests, - types-toml, - ] +#- repo: https://github.com/astral-sh/ruff-pre-commit +# rev: v0.11.7 +# hooks: +# - id: ruff +#- repo: https://github.com/pre-commit/mirrors-mypy +# rev: v1.15.0 +# hooks: +# - id: mypy +# args: [--check-untyped-defs] +# additional_dependencies: +# [ +# # main dependencies +# click, +# datasets, +# ftfy, +# loguru, +# numpy, +# pillow, +# pydantic, +# pydantic_settings, +# pyyaml, +# respx, +# rich, +# setuptools, +# setuptools-git-versioning, +# transformers, +# +# # dev dependencies +# pytest, +# pydantic_settings, +# +# # types +# types-click, +# types-PyYAML, +# types-requests, +# types-toml, +# ] diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 102ebd69..0d27c94e 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -1,3 +1,4 @@ +from datetime import timedelta import asyncio import math import multiprocessing @@ -269,27 +270,17 @@ async def _start_processes( futures = [] loop = asyncio.get_event_loop() for id_, requests_limit in zip(process_ids, process_requests_limits): - if scheduling_strategy.processing_mode == "sync": + if scheduling_strategy.processing_mode in ["sync", "async"]: futures.append( loop.run_in_executor( executor, - self.worker.process_loop_synchronous, + self.worker.run_process, requests_queue, responses_queue, - id_, shutdown_event, - ) - ) - elif scheduling_strategy.processing_mode == "async": - futures.append( - loop.run_in_executor( - executor, - self.worker.process_loop_asynchronous, - requests_queue, - responses_queue, - requests_limit, + timedelta(seconds=10).total_seconds(), id_, - shutdown_event, + requests_limit, ) ) else: diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index f80afb33..b458224f 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -194,22 +194,25 @@ async def resolve_scheduler_request( def run_process( self, - type_: Literal["synchronous", "asynchronous"], + type_: Literal["sync", "async"], requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, shutdown_event: multiprocessing.Event, - shutdown_poll_interval: float, + shutdown_poll_interval_seconds: float, process_id: int, - max_concurrency: int, + max_concurrency: Optional[int] = None, ): async def _process_runner(): - if type_ == "synchronous": + if type_ == "sync": loop_task = asyncio.create_task(self._process_synchronous_requests_loop( requests_queue=requests_queue, results_queue=results_queue, process_id=process_id, ), name="request_loop_processor_task") - elif type_ == "asynchronous": + elif type_ == "async": + if max_concurrency is None: + raise ValueError("max_concurrency must be set " + "for async processor") loop_task = 
asyncio.create_task(self._process_asynchronous_requests_loop( requests_queue=requests_queue, results_queue=results_queue, @@ -220,8 +223,8 @@ async def _process_runner(): raise ValueError(f"Invalid process type: {type_}") shutdown_task = asyncio.create_task( - self._wait_for_shutdown(shutdown_event, shutdown_poll_interval), - name="shutdown_task" + self._wait_for_shutdown(shutdown_event, shutdown_poll_interval_seconds), + name="shutdown_task", ) done, pending = await asyncio.wait( From 464ebe359c05b711970a478712edb1869ce9c0ea Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Sun, 1 Jun 2025 15:27:43 +0300 Subject: [PATCH 26/37] WIP --- src/guidellm/scheduler/scheduler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 0d27c94e..859366f3 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -251,6 +251,7 @@ async def _start_processes( scheduling_strategy.processes_limit, scheduling_strategy.processing_requests_limit, ) + num_processes = 1 requests_limit_split = ( scheduling_strategy.processing_requests_limit // scheduling_strategy.processes_limit @@ -275,6 +276,7 @@ async def _start_processes( loop.run_in_executor( executor, self.worker.run_process, + scheduling_strategy.processing_mode, requests_queue, responses_queue, shutdown_event, From 883593aece54b4ad60d4e8846bd49a50402579a0 Mon Sep 17 00:00:00 2001 From: markvaykhansky Date: Tue, 3 Jun 2025 10:25:04 +0300 Subject: [PATCH 27/37] wip --- src/guidellm/backend/openai.py | 12 +-- src/guidellm/objects/pydantic.py | 10 +- src/guidellm/scheduler/repro.py | 138 ++++++++++++++++++++++++++++ src/guidellm/scheduler/scheduler.py | 8 +- src/guidellm/scheduler/worker.py | 18 +++- 5 files changed, 168 insertions(+), 18 deletions(-) create mode 100644 src/guidellm/scheduler/repro.py diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index e3f23963..5aec53fa 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -572,12 +572,12 @@ async def _iterative_completions_request( async for line in stream.aiter_lines(): iter_time = time.time() - logger.debug( - "{} request: {} recieved iter response line: {}", - self.__class__.__name__, - request_id, - line, - ) + # logger.debug( + # "{} request: {} recieved iter response line: {}", + # self.__class__.__name__, + # request_id, + # line, + # ) if not line or not line.strip().startswith("data:"): continue diff --git a/src/guidellm/objects/pydantic.py b/src/guidellm/objects/pydantic.py index 8365be33..92658e17 100644 --- a/src/guidellm/objects/pydantic.py +++ b/src/guidellm/objects/pydantic.py @@ -21,11 +21,11 @@ class StandardBaseModel(BaseModel): def __init__(self, /, **data: Any) -> None: super().__init__(**data) - logger.debug( - "Initialized new instance of {} with data: {}", - self.__class__.__name__, - data, - ) + # logger.debug( + # "Initialized new instance of {} with data: {}", + # self.__class__.__name__, + # data, + # ) SuccessfulT = TypeVar("SuccessfulT") diff --git a/src/guidellm/scheduler/repro.py b/src/guidellm/scheduler/repro.py new file mode 100644 index 00000000..f9f76830 --- /dev/null +++ b/src/guidellm/scheduler/repro.py @@ -0,0 +1,138 @@ +import asyncio +import multiprocessing +import time +import logging +import threading + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s', + datefmt='%H:%M:%S' +) + +# A multiprocessing queue 
that will remain empty +# Naming it mp_queue to distinguish from asyncio.Queue +mp_queue = multiprocessing.Queue() + + +async def get_item_from_mp_queue(q: multiprocessing.Queue, worker_id: int): + """ + Coroutine that tries to get an item from a multiprocessing.Queue + using asyncio.to_thread. + """ + logging.info(f"Worker {worker_id}: get_item_from_mp_queue: ENTERED. Awaiting asyncio.to_thread(q.get).") + try: + # This is the blocking call in a separate thread + item = await asyncio.to_thread(q.get) + # We don't expect this to be reached if the queue is empty + logging.info( + f"Worker {worker_id}: get_item_from_mp_queue: asyncio.to_thread RETURNED NORMALLY with item: {item}.") + return item + except asyncio.CancelledError: + # This is where it SHOULD go if the task awaiting this coroutine is cancelled, + # and asyncio.to_thread correctly propagates the cancellation to its awaiter. + logging.error( + f"Worker {worker_id}: get_item_from_mp_queue: CAUGHT CancelledError from asyncio.to_thread directly!") + raise # Re-raise to propagate the cancellation + except Exception as e: + logging.error(f"Worker {worker_id}: get_item_from_mp_queue: CAUGHT an UNEXPECTED EXCEPTION {type(e)}: {e}", + exc_info=True) + raise + finally: + # This finally block will execute. The key is whether the CancelledError was caught above. + logging.info(f"Worker {worker_id}: get_item_from_mp_queue: EXITED (finally block).") + + +async def worker_coroutine(worker_id: int, q: multiprocessing.Queue): + """ + The main coroutine for our worker task. It will try to get an item + from the queue. + """ + logging.info(f"Worker {worker_id}: worker_coroutine: STARTED.") + try: + logging.info(f"Worker {worker_id}: worker_coroutine: About to await get_item_from_mp_queue.") + # This is the await point where CancelledError should be injected + # if this worker_coroutine task is cancelled. + await get_item_from_mp_queue(q, worker_id) + logging.info(f"Worker {worker_id}: worker_coroutine: get_item_from_mp_queue completed (unexpectedly).") + except asyncio.CancelledError: + logging.error(f"Worker {worker_id}: worker_coroutine: SUCCESSFULLY CAUGHT CancelledError.") + # Perform any task-specific cleanup here if needed + except Exception as e: + logging.error(f"Worker {worker_id}: worker_coroutine: CAUGHT UNEXPECTED EXCEPTION {type(e)}: {e}", + exc_info=True) + finally: + logging.info(f"Worker {worker_id}: worker_coroutine: FINISHED (finally block).") + + +async def main_orchestrator(): + """ + Orchestrates the test: creates, runs, and cancels the worker. + """ + logging.info("Main Orchestrator: Starting worker task.") + worker_task = asyncio.create_task(worker_coroutine(1, mp_queue), name="WorkerCoroutine-1") + + # Give the worker task a moment to start and block on the queue + logging.info("Main Orchestrator: Sleeping for 1 second to let worker block...") + await asyncio.sleep(1) + + logging.info(f"Main Orchestrator: Current active threads: {[t.name for t in threading.enumerate()]}...") + + # Cancel the worker task + logging.info("Main Orchestrator: Cancelling worker_task...") + worker_task.cancel() + + # Wait for the worker task to finish, with a timeout. + # If cancellation works as expected, worker_task should complete (by handling CancelledError) + # well before the timeout. + # If it gets stuck, asyncio.TimeoutError will be raised.
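+    # Note: even when cancellation propagates promptly to the awaiting task,
+    # the thread backing asyncio.to_thread can stay blocked inside q.get(),
+    # since Python threads cannot be interrupted from outside. This is why
+    # the scheduler's worker polls its shutdown event with a queue timeout.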
+ timeout_seconds = 5.0 + logging.info(f"Main Orchestrator: Awaiting worker_task with timeout {timeout_seconds}s...") + try: + await asyncio.wait_for(worker_task, timeout=timeout_seconds) + logging.info("Main Orchestrator: worker_task completed WITHOUT timeout.") + except asyncio.TimeoutError: + logging.error( + f"Main Orchestrator: TIMEOUT! worker_task did not finish within {timeout_seconds}s after cancellation.") + logging.error( + f"Main Orchestrator: worker_task.done() = {worker_task.done()}, worker_task.cancelled() = {worker_task.cancelled()}") + # At this point, the thread running mp_queue.get() is likely still blocked. + except asyncio.CancelledError: + # This would happen if main_orchestrator itself was cancelled, not expected here. + logging.error("Main Orchestrator: main_orchestrator itself was cancelled (unexpected).") + except Exception as e: + logging.error(f"Main Orchestrator: An unexpected error occurred while waiting for worker_task: {e}", + exc_info=True) + finally: + logging.info("Main Orchestrator: Test finished.") + # Note: The thread started by asyncio.to_thread for mp_queue.get() + # might still be alive and blocked if q.get() wasn't unblocked. + # It's a daemon thread by default, so it won't prevent program exit. + # To clean it up, one would typically put a sentinel into mp_queue. + # For this test, we are focused on the asyncio task cancellation. + logging.info( + f"Main Orchestrator: Final check: worker_task.done() = {worker_task.done()}, worker_task.cancelled() = {worker_task.cancelled()}") + + # Attempt to unblock the queue to allow the thread to exit, + # though the test's focus is on the asyncio cancellation. + try: + mp_queue.put_nowait(None) # Sentinel + logging.info("Main Orchestrator: Put sentinel in mp_queue to unblock thread.") + except Exception: + logging.warning("Main Orchestrator: Could not put sentinel in mp_queue.") + + +if __name__ == "__main__": + # For multiprocessing queues to work correctly, especially on Windows/macOS + # with 'spawn' or 'forkserver' start methods, it's good practice + # to ensure the queue is created in the main process scope before tasks. + # In this simple script, it's fine. 
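+    # Note: no child processes are ever started in this repro; only the queue's
+    # feeder thread and the asyncio.to_thread helper thread are involved, so the
+    # start method itself is not exercised here.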
+ try: + asyncio.run(main_orchestrator()) + except KeyboardInterrupt: + logging.info("Main Orchestrator: Keyboard interrupt received.") + finally: + mp_queue.close() + mp_queue.join_thread() # Ensure queue's feeder thread is joined + logging.info("Main Orchestrator: mp_queue resources released.") diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 859366f3..ad822036 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -154,6 +154,7 @@ async def run( ): # we've exhausted all requests we've wanted to run # and yielded all responses + logger.info("run_info.completed_requests >= run_info.created_requests") break requests_iter = self._add_requests( @@ -198,7 +199,7 @@ async def run( run_info=run_info, ) - await self._stop_processes(futures, requests_queue) + await self._stop_processes(futures, shutdown_event, requests_queue) def _validate_scheduler_params( self, @@ -457,10 +458,9 @@ def _check_result_ready( async def _stop_processes( self, futures: list[asyncio.Future], + shutdown_event: MultiprocessingEvent, requests_queue: multiprocessing.Queue, ): - for _ in futures: - requests_queue.put(None) - + shutdown_event.set() logger.debug("Waiting for futures to shut down") await asyncio.gather(*futures) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index b458224f..38cfecbd 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -223,7 +223,10 @@ async def _process_runner(): raise ValueError(f"Invalid process type: {type_}") shutdown_task = asyncio.create_task( - self._wait_for_shutdown(shutdown_event, shutdown_poll_interval_seconds), + self._wait_for_shutdown( + shutdown_event=shutdown_event, + shutdown_poll_interval=shutdown_poll_interval_seconds + ), name="shutdown_task", ) @@ -236,7 +239,9 @@ async def _process_runner(): ) for task in pending: - task.cancel() + logger.debug(f"Cancelling task {task.get_name()}") + cancel_result = task.cancel() + logger.debug(f"{'Task is already done or canceled' if not cancel_result else 'sent cancel signal'}") try: await task except asyncio.CancelledError: @@ -265,6 +270,8 @@ async def _wait_for_shutdown( while not shutdown_event.is_set(): await asyncio.sleep(shutdown_poll_interval) + logger.debug("Shutdown signal received") + raise ValueError("kaki") raise asyncio.CancelledError("Shutdown event set, cancelling process loop.") async def _process_synchronous_requests_loop( @@ -290,6 +297,9 @@ async def _process_synchronous_requests_loop( process_id=process_id, ) + logger.debug("Done processing synchronous loop") + + async def _process_asynchronous_requests_loop( self, requests_queue: multiprocessing.Queue, @@ -303,6 +313,7 @@ async def _process_asynchronous_requests_loop( raise ValueError("Async worker called with max_concurrency < 1") while True: + logger.info("Awaiting request...") process_request = await self.get_request( requests_queue=requests_queue, ) @@ -315,7 +326,6 @@ async def _process_asynchronous_requests_loop( ) await pending.acquire() - lock_acquired_at = time.time() logger.debug( f"Lock acquired Process ID {process_id} ||" @@ -341,6 +351,8 @@ def _task_done(_: asyncio.Task): task.add_done_callback(_task_done) await asyncio.sleep(0) # enable start task immediately + logger.debug("Done processing asynchronous loop") + class GenerativeRequestsWorkerDescription(WorkerDescription): type_: Literal["generative_requests_worker"] = "generative_requests_worker" # type: ignore[assignment] From 
35abac72643ee816504d2431fa48fe1114866b2c Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 4 Jun 2025 09:30:48 +0300 Subject: [PATCH 28/37] WIP - Stuck after shutdown signal received --- src/guidellm/scheduler/worker.py | 54 ++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index 38cfecbd..9b6f283d 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -123,10 +123,34 @@ async def resolve( ... async def get_request( - self, - requests_queue: multiprocessing.Queue, + self, requests_queue: multiprocessing.Queue, + shutdown_event: MultiprocessingEvent, + process_id: int, + shutdown_poll_interval_seconds: float, ) -> Optional[WorkerProcessRequest[RequestT]]: - return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] + # We need to check shutdown_event intermittently cause + # if we simply use asyncio.to_thread(requests_queue.get) + # the cancellation task doesn't propagate because the + # asyncio.to_thread is blocking + return await asyncio.to_thread(requests_queue.get) + # def _get_queue_intermittently(): + # while True: + # try: + # return requests_queue.get(timeout=shutdown_poll_interval_seconds) + # except queue.Empty: + # logger.info("Checking shutdown even is set in get_request") + # if shutdown_event.is_set(): + # logger.info(f"Shutdown signal received in future {process_id}") + # raise asyncio.CancelledError() + # # return None + # + # try: + # return await asyncio.to_thread(_get_queue_intermittently) # type: ignore[attr-defined] + # except asyncio.CancelledError: + # logger.info("kaki") + # # return None + # raise + # # raise async def send_result( self, @@ -203,11 +227,15 @@ def run_process( max_concurrency: Optional[int] = None, ): async def _process_runner(): + import threading + internal_shutdown_event = threading.Event() if type_ == "sync": loop_task = asyncio.create_task(self._process_synchronous_requests_loop( requests_queue=requests_queue, results_queue=results_queue, process_id=process_id, + shutdown_event=internal_shutdown_event, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, ), name="request_loop_processor_task") elif type_ == "async": if max_concurrency is None: @@ -218,6 +246,8 @@ async def _process_runner(): results_queue=results_queue, max_concurrency=max_concurrency, process_id=process_id, + shutdown_event=internal_shutdown_event, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, ), name="request_loop_processor_task") else: raise ValueError(f"Invalid process type: {type_}") @@ -237,10 +267,12 @@ async def _process_runner(): ], return_when=asyncio.FIRST_EXCEPTION, ) + logger.info("First exception happened") for task in pending: logger.debug(f"Cancelling task {task.get_name()}") cancel_result = task.cancel() + internal_shutdown_event.set() logger.debug(f"{'Task is already done or canceled' if not cancel_result else 'sent cancel signal'}") try: await task @@ -271,7 +303,6 @@ async def _wait_for_shutdown( await asyncio.sleep(shutdown_poll_interval) logger.debug("Shutdown signal received") - raise ValueError("kaki") raise asyncio.CancelledError("Shutdown event set, cancelling process loop.") async def _process_synchronous_requests_loop( @@ -279,10 +310,15 @@ async def _process_synchronous_requests_loop( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: MultiprocessingEvent, + shutdown_poll_interval_seconds: 
float, ): while True: process_request = await self.get_request( requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds ) dequeued_time = time.time() @@ -297,15 +333,14 @@ async def _process_synchronous_requests_loop( process_id=process_id, ) - logger.debug("Done processing synchronous loop") - - async def _process_asynchronous_requests_loop( self, requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: MultiprocessingEvent, + shutdown_poll_interval_seconds: float, ): pending = asyncio.Semaphore(max_concurrency) @@ -316,6 +351,9 @@ async def _process_asynchronous_requests_loop( logger.info("Awaiting request...") process_request = await self.get_request( requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, ) dequeued_time = time.time() @@ -351,8 +389,6 @@ def _task_done(_: asyncio.Task): task.add_done_callback(_task_done) await asyncio.sleep(0) # enable start task immediately - logger.debug("Done processing asynchronous loop") - class GenerativeRequestsWorkerDescription(WorkerDescription): type_: Literal["generative_requests_worker"] = "generative_requests_worker" # type: ignore[assignment] From 55cf7187447dd7687502cbebabdd34633f252c54 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Wed, 4 Jun 2025 13:41:45 +0300 Subject: [PATCH 29/37] WIP - New cancellation mechanism works --- src/guidellm/benchmark/aggregator.py | 2 +- src/guidellm/scheduler/scheduler.py | 12 ++----- src/guidellm/scheduler/worker.py | 48 +++++++++++++++------------- 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index cd725326..a17f642f 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -638,7 +638,7 @@ def _calculate_error_rate(self) -> float: total_successful = self.requests_stats.totals.successful.total total_errored = self.requests_stats.totals.errored.total total_finished = total_errored + total_successful - return total_errored / total_finished + return total_errored / total_finished if total_finished > 0 else 0 def _compile_results( self, diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index ad822036..628272c1 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -175,11 +175,6 @@ async def run( and not iter_result.request_info.canceled and self._is_max_error_rate_reached(iter_result.run_info) ): - if shutdown_event is None: - raise RuntimeError( - "We've reached max_error_rate " - "but shutdown_event is corrupt" - ) shutdown_event.set() max_error_rate_reached = True logger.info( @@ -199,7 +194,7 @@ async def run( run_info=run_info, ) - await self._stop_processes(futures, shutdown_event, requests_queue) + await self._stop_processes(futures, shutdown_event) def _validate_scheduler_params( self, @@ -252,7 +247,6 @@ async def _start_processes( scheduling_strategy.processes_limit, scheduling_strategy.processing_requests_limit, ) - num_processes = 1 requests_limit_split = ( scheduling_strategy.processing_requests_limit // scheduling_strategy.processes_limit @@ -459,8 +453,8 @@ async def _stop_processes( self, futures: list[asyncio.Future], shutdown_event: MultiprocessingEvent, - requests_queue: multiprocessing.Queue, ): - 
From 55cf7187447dd7687502cbebabdd34633f252c54 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Wed, 4 Jun 2025 13:41:45 +0300
Subject: [PATCH 29/37] WIP - New cancellation mechanism works

---
 src/guidellm/benchmark/aggregator.py |  2 +-
 src/guidellm/scheduler/scheduler.py  | 12 ++-----
 src/guidellm/scheduler/worker.py     | 48 +++++++++++++++-------------
 3 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py
index cd725326..a17f642f 100644
--- a/src/guidellm/benchmark/aggregator.py
+++ b/src/guidellm/benchmark/aggregator.py
@@ -638,7 +638,7 @@ def _calculate_error_rate(self) -> float:
         total_successful = self.requests_stats.totals.successful.total
         total_errored = self.requests_stats.totals.errored.total
         total_finished = total_errored + total_successful
-        return total_errored / total_finished
+        return total_errored / total_finished if total_finished > 0 else 0

     def _compile_results(
         self,
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index ad822036..628272c1 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -175,11 +175,6 @@ async def run(
                             and not iter_result.request_info.canceled
                             and self._is_max_error_rate_reached(iter_result.run_info)
                         ):
-                            if shutdown_event is None:
-                                raise RuntimeError(
-                                    "We've reached max_error_rate "
-                                    "but shutdown_event is corrupt"
-                                )
                             shutdown_event.set()
                             max_error_rate_reached = True
                             logger.info(
@@ -199,7 +194,7 @@ async def run(
                     run_info=run_info,
                 )

-            await self._stop_processes(futures, shutdown_event, requests_queue)
+            await self._stop_processes(futures, shutdown_event)

     def _validate_scheduler_params(
         self,
@@ -252,7 +247,6 @@ async def _start_processes(
             scheduling_strategy.processes_limit,
             scheduling_strategy.processing_requests_limit,
         )
-        num_processes = 1
         requests_limit_split = (
             scheduling_strategy.processing_requests_limit
             // scheduling_strategy.processes_limit
@@ -459,8 +453,8 @@ async def _stop_processes(
         self,
         futures: list[asyncio.Future],
         shutdown_event: MultiprocessingEvent,
-        requests_queue: multiprocessing.Queue,
     ):
-        shutdown_event.set()
+        if not shutdown_event.is_set():
+            shutdown_event.set()

         logger.debug("Waiting for futures to shut down")
         await asyncio.gather(*futures)
diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py
index 9b6f283d..1b82c335 100644
--- a/src/guidellm/scheduler/worker.py
+++ b/src/guidellm/scheduler/worker.py
@@ -42,6 +42,10 @@
 ]


+class ShutdownSignalReceived(Exception):
+    pass
+
+
 @dataclass
 class WorkerProcessRequest(Generic[RequestT]):
     request: RequestT
@@ -132,25 +136,24 @@ async def get_request(
         # if we simply use asyncio.to_thread(requests_queue.get)
         # the cancellation task doesn't propagate because the
         # asyncio.to_thread is blocking
-        return await asyncio.to_thread(requests_queue.get)
-        # def _get_queue_intermittently():
-        #     while True:
-        #         try:
-        #             return requests_queue.get(timeout=shutdown_poll_interval_seconds)
-        #         except queue.Empty:
-        #             logger.info("Checking shutdown event is set in get_request")
-        #             if shutdown_event.is_set():
-        #                 logger.info(f"Shutdown signal received in future {process_id}")
-        #                 raise asyncio.CancelledError()
-        #             # return None
-        #
-        # try:
-        #     return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]
-        # except asyncio.CancelledError:
-        #     logger.info("kaki")
-        #     # return None
-        #     raise
-        # # raise
+        def _get_queue_intermittently():
+            while True:
+                try:
+                    return requests_queue.get(timeout=shutdown_poll_interval_seconds)
+                except queue.Empty:
+                    logger.info("Checking shutdown event is set in get_request")
+                    if shutdown_event.is_set():
+                        logger.info(f"Shutdown signal received in future {process_id}")
+                        raise asyncio.CancelledError()
+                    # return None
+
+        try:
+            return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]
+        except asyncio.CancelledError:
+            logger.info("kaki")
+            # return None
+            raise
+        # raise

     async def send_result(
         self,
@@ -267,7 +270,7 @@ async def _process_runner():
                 ],
                 return_when=asyncio.FIRST_EXCEPTION,
             )
-            logger.info("First exception happened")
+            logger.info(f"First exception happened, done: [{[r.get_name() for r in done]}]")

             for task in pending:
                 logger.debug(f"Cancelling task {task.get_name()}")
@@ -281,7 +284,7 @@ async def _process_runner():

             for task in done:
                 task_exception = task.exception()
-                if not isinstance(task_exception, asyncio.CancelledError):
+                if not isinstance(task_exception, ShutdownSignalReceived):
                     raise task_exception
         try:
             asyncio.run(_process_runner())
         except Exception as exc:  # noqa: BLE001
@@ -303,7 +306,8 @@ async def _wait_for_shutdown(
             await asyncio.sleep(shutdown_poll_interval)

         logger.debug("Shutdown signal received")
-        raise asyncio.CancelledError("Shutdown event set, cancelling process loop.")
+        raise ShutdownSignalReceived("Shutdown event set, cancelling process loop.")
+        # raise asyncio.CancelledError("Shutdown event set, cancelling process loop.")

     async def _process_synchronous_requests_loop(
         self,

From 1bc8f9aec84fb5b84fd0970d0747a109fcaa5646 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Wed, 4 Jun 2025 13:53:57 +0300
Subject: [PATCH 30/37] WIP - Minor fixes

---
 src/guidellm/scheduler/scheduler.py |  2 +-
 src/guidellm/scheduler/worker.py    | 16 ++++------------
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 628272c1..46396fde 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -218,7 +218,7 @@ def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
         if run_info.max_error_rate is None:
             return False
         current_error_rate = run_info.errored_requests / run_info.end_number
-        logger.info(
+        logger.debug(
             f"Current error rate {current_error_rate} "
             f"i.e total_finished [success / error] / max total possible"
         )
diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py
index 1b82c335..ce875409 100644
--- a/src/guidellm/scheduler/worker.py
+++ b/src/guidellm/scheduler/worker.py
@@ -145,15 +145,7 @@ def _get_queue_intermittently():
                     if shutdown_event.is_set():
                         logger.info(f"Shutdown signal received in future {process_id}")
                         raise asyncio.CancelledError()
-                    # return None
-
-        try:
-            return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]
-        except asyncio.CancelledError:
-            logger.info("kaki")
-            # return None
-            raise
-        # raise
+        return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]

     async def send_result(
         self,
@@ -305,9 +297,10 @@ async def _wait_for_shutdown(
         while not shutdown_event.is_set():
             await asyncio.sleep(shutdown_poll_interval)

-        logger.debug("Shutdown signal received")
+        # Raising asyncio.CancelledError instead would
+        # cause the asyncio.wait above to wait
+        # forever; we couldn't find a clear reason why
         raise ShutdownSignalReceived("Shutdown event set, cancelling process loop.")

     async def _process_synchronous_requests_loop(
         self,
@@ -352,7 +345,6 @@ async def _process_asynchronous_requests_loop(
             raise ValueError("Async worker called with max_concurrency < 1")

         while True:
-            logger.info("Awaiting request...")
             process_request = await self.get_request(
                 requests_queue=requests_queue,
                 shutdown_event=shutdown_event,

From 99457108fea83087dc0ba662c9ac7e8d896afe1b Mon Sep 17 00:00:00 2001
From: markvaykhansky
Date: Wed, 4 Jun 2025 15:42:28 +0300
Subject: [PATCH 31/37] Add shutdown check interval to settings

---
 src/guidellm/config.py              | 1 +
 src/guidellm/scheduler/scheduler.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/guidellm/config.py b/src/guidellm/config.py
index ed7e782b..f137d52b 100644
--- a/src/guidellm/config.py
+++ b/src/guidellm/config.py
@@ -113,6 +113,7 @@ class Settings(BaseSettings):
     default_async_loop_sleep: float = 10e-5
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
+    shutdown_poll_interval_seconds: float = 10

     # HTTP settings
     request_follow_redirects: bool = True
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 46396fde..e5a44a66 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -275,7 +275,7 @@ async def _start_processes(
                     requests_queue,
                     responses_queue,
                     shutdown_event,
-                    timedelta(seconds=10).total_seconds(),
+                    settings.shutdown_poll_interval_seconds,
                     id_,
                     requests_limit,
                 )
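
Aside: patches 29-30 settle on a shutdown pattern worth spelling out: a watcher coroutine polls the shutdown event and raises a dedicated sentinel exception, which asyncio.wait(..., return_when=FIRST_EXCEPTION) treats as completion, after which the runner cancels the still-pending loop task. A self-contained sketch under assumed, simplified names (not the series' actual code):

import asyncio
import threading

class ShutdownSignalReceived(Exception):
    pass

async def watch(stop: threading.Event, poll: float = 0.05):
    while not stop.is_set():
        await asyncio.sleep(poll)
    raise ShutdownSignalReceived()

async def work():
    while True:  # stand-in for the request-processing loop
        await asyncio.sleep(0.01)

async def runner(stop: threading.Event):
    tasks = [asyncio.create_task(work()), asyncio.create_task(watch(stop))]
    done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
    for task in pending:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
    for task in done:
        exc = task.exception()
        if exc is not None and not isinstance(exc, ShutdownSignalReceived):
            raise exc

async def main():
    stop = threading.Event()
    run = asyncio.create_task(runner(stop))
    await asyncio.sleep(0.2)
    stop.set()
    await run
    print("clean shutdown")

asyncio.run(main())

Raising a dedicated sentinel rather than asyncio.CancelledError matters here: a task that finishes by raising CancelledError is treated as cancelled, so an intentional shutdown becomes indistinguishable from a real cancellation — exactly the ambiguity the sentinel avoids.
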
From f11da242fdab41621a33cf179bbe7d67cb32ec36 Mon Sep 17 00:00:00 2001
From: markvaykhansky
Date: Thu, 5 Jun 2025 07:49:34 +0300
Subject: [PATCH 32/37] WIP - Support more rate types

---
 src/guidellm/config.py              |  1 +
 src/guidellm/scheduler/result.py    |  5 +++
 src/guidellm/scheduler/scheduler.py | 47 +++++++++++++++++++++--------
 3 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/guidellm/config.py b/src/guidellm/config.py
index f137d52b..cc79b6e3 100644
--- a/src/guidellm/config.py
+++ b/src/guidellm/config.py
@@ -114,6 +114,7 @@ class Settings(BaseSettings):
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
     shutdown_poll_interval_seconds: float = 10
+    constant_error_check_window_size = 100

     # HTTP settings
     request_follow_redirects: bool = True
diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py
index f899f54a..990a4138 100644
--- a/src/guidellm/scheduler/result.py
+++ b/src/guidellm/scheduler/result.py
@@ -1,3 +1,4 @@
+from collections import deque
 from typing import (
     Generic,
     Literal,
@@ -16,6 +17,8 @@
 ]


+RequestStatus = Literal["success" | "error"]
+
 class SchedulerRunInfo(StandardBaseModel):
     """
     Information about the current run of the scheduler.
@@ -55,6 +58,8 @@ class SchedulerRunInfo(StandardBaseModel):
     completed_requests: int = 0
     errored_requests: int = 0

+    last_requests_statuses: Optional[deque[RequestStatus]] = None
+

 class SchedulerRequestInfo(StandardBaseModel):
     """
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index e5a44a66..4345f550 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -1,3 +1,4 @@
+import collections
 from datetime import timedelta
 import asyncio
 import math
@@ -128,10 +129,11 @@ async def run(
                 responses_queue,
                 shutdown_event,
             ) = await self._start_processes(
-                manager, executor, scheduling_strategy, max_error_rate is not None
+                manager, executor, scheduling_strategy
             )
-            if shutdown_event and shutdown_event.is_set():
+            if shutdown_event.is_set():
                 raise RuntimeError("shutdown_event is set before starting scheduling")
+
             run_info, requests_iter, times_iter = self._run_setup(
                 futures, scheduling_strategy, max_number, max_duration, max_error_rate
             )
@@ -217,27 +219,42 @@ def _validate_scheduler_params(
     def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
         if run_info.max_error_rate is None:
             return False
-        current_error_rate = run_info.errored_requests / run_info.end_number
-        logger.debug(
-            f"Current error rate {current_error_rate} "
-            f"i.e total_finished [success / error] / max total possible"
-        )
-        return run_info.max_error_rate < current_error_rate
+
+        is_max_error_rate = run_info.max_error_rate < 1
+        if not is_max_error_rate:
+            # Constant value
+            raise NotImplementedError()
+        if(
+            run_info.strategy.type_ == "constant"
+            and run_info.end_number != math.inf
+        ):
+            # We know how many requests
+            current_error_rate = run_info.errored_requests / run_info.end_number
+            logger.debug(
+                f"Current error rate {current_error_rate} "
+                f"i.e total_finished [success / error] / max total possible"
+            )
+            return run_info.max_error_rate < current_error_rate
+        elif settings.constant_error_check_window_size <= run_info.completed_requests:
+            # Calculate deque ratio of success to error
+            if run_info.last_requests_statuses is None:
+                raise RuntimeError("")
+            return
+        return False

     async def _start_processes(
         self,
         manager,
         executor: ProcessPoolExecutor,
         scheduling_strategy: SchedulingStrategy,
-        create_shutdown_event: bool = False,
     ) -> tuple[
         list[asyncio.Future],
         multiprocessing.Queue,
         multiprocessing.Queue,
-        Optional[MultiprocessingEvent],
+        MultiprocessingEvent,
     ]:
         await self.worker.prepare_multiprocessing()
-        shutdown_event = manager.Event() if create_shutdown_event else None
+        shutdown_event = manager.Event()
         requests_queue = manager.Queue(
             maxsize=scheduling_strategy.queued_requests_limit
         )
@@ -325,6 +342,7 @@ def _run_setup(
             processes=len(processes),
             strategy=scheduling_strategy,
             max_error_rate=max_error_rate,
+            last_requests_statuses = collections.deque(maxlen=settings.constant_error_check_window_size) if max_error_rate > 1 else None
         )

         return info, requests_iter, times_iter
@@ -437,9 +455,14 @@ def _check_result_ready(
         run_info.processing_requests -= 1
         run_info.completed_requests += 1

-        if process_response.info.errored:
+        is_errored = process_response.info.errored
+        if is_errored:
             run_info.errored_requests += 1

+        if run_info.last_requests_statuses:
+            status = "error" if is_errored else "success"
+            run_info.last_requests_statuses.append(status)
+
         return SchedulerRequestResult(
             type_="request_complete",
             run_info=run_info,

From 6c6c15ac89d5c5c566627ba2d1d7d5aff0533858 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Thu, 5 Jun 2025 09:24:12 +0300
Subject: [PATCH 33/37] Support more rate types as well as a constant error
 count value

---
 src/guidellm/benchmark/benchmarker.py |  1 -
 src/guidellm/config.py                |  2 +-
 src/guidellm/scheduler/result.py      |  3 +-
 src/guidellm/scheduler/scheduler.py   | 67 ++++++++++++++++-----------
 4 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py
index ecb721f7..dddcadb5 100644
--- a/src/guidellm/benchmark/benchmarker.py
+++ b/src/guidellm/benchmark/benchmarker.py
@@ -77,7 +77,6 @@ class BenchmarkerStrategyLimits(StandardBaseModel):
     max_error_rate: Optional[float] = Field(
         description="Maximum error rate after which a benchmark will stop",
         ge=0,
-        le=1,
     )
     warmup_percent_per_strategy: Optional[float] = Field(
         description="Percentage of requests to use for warmup.",
diff --git a/src/guidellm/config.py b/src/guidellm/config.py
index cc79b6e3..b5b993d3 100644
--- a/src/guidellm/config.py
+++ b/src/guidellm/config.py
@@ -114,7 +114,7 @@ class Settings(BaseSettings):
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
     shutdown_poll_interval_seconds: float = 10
-    constant_error_check_window_size = 100
+    error_check_window_size: int = 10

     # HTTP settings
     request_follow_redirects: bool = True
diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py
index 990a4138..4bf15971 100644
--- a/src/guidellm/scheduler/result.py
+++ b/src/guidellm/scheduler/result.py
@@ -17,7 +17,8 @@
 ]


-RequestStatus = Literal["success" | "error"]
+RequestStatus = Literal["success", "error"]
+

 class SchedulerRunInfo(StandardBaseModel):
     """
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 4345f550..f6129d14 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -12,7 +12,7 @@
     Any,
     Generic,
     Optional,
-    Union,
+    Union, Literal, cast,
 )

 from loguru import logger
@@ -213,33 +213,48 @@ def _validate_scheduler_params(
             raise ValueError(f"Invalid max_number: {max_number}")
         if max_duration is not None and max_duration < 0:
             raise ValueError(f"Invalid max_duration: {max_duration}")
-        if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1):
+        if max_error_rate is not None and (max_error_rate < 0):
             raise ValueError(f"Invalid max_error_rate: {max_error_rate}")

     def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
-        if run_info.max_error_rate is None:
+        max_error = run_info.max_error_rate
+        if max_error is None:
             return False

-        is_max_error_rate = run_info.max_error_rate < 1
-        if not is_max_error_rate:
-            # Constant value
-            raise NotImplementedError()
-        if(
+        if not max_error < 1:
+            # Absolute error count, i.e not a ratio
+            logger.debug(
+                f"Current error count "
+                f"{run_info.errored_requests} / "
+                f"{max_error} (max error)"
+            )
+            return max_error < run_info.errored_requests
+        elif(
             run_info.strategy.type_ == "constant"
             and run_info.end_number != math.inf
         ):
-            # We know how many requests
-            current_error_rate = run_info.errored_requests / run_info.end_number
+            current_error_ratio = run_info.errored_requests / run_info.end_number
             logger.debug(
-                f"Current error rate {current_error_rate} "
+                f"Current error rate {current_error_ratio} "
                 f"i.e total_finished [success / error] / max total possible"
             )
-            return run_info.max_error_rate < current_error_rate
-        elif settings.constant_error_check_window_size <= run_info.completed_requests:
-            # Calculate deque ratio of success to error
-            if run_info.last_requests_statuses is None:
-                raise RuntimeError("")
-            return
+            return max_error < current_error_ratio
+        elif settings.error_check_window_size <= run_info.completed_requests:
+            last_requests_statuses = run_info.last_requests_statuses
+            last_errored_requests_count = len([
+                s
+                for s
+                in last_requests_statuses
+                if s == "error"
+            ])
+            current_error_ratio = last_errored_requests_count / len(last_requests_statuses)
+            logger.debug(
+                f"Current error rate in "
+                f"last requests window is "
+                f"{current_error_ratio} / {max_error} "
+                f"(max error rate)"
+            )
+            return max_error < current_error_ratio
         return False

     async def _start_processes(
@@ -323,12 +338,6 @@ def _run_setup(
             scheduling_strategy, max_duration, max_number
         )

-        if end_number == math.inf and max_error_rate is not None:
-            logger.warning(
-                "max_error_rate will be ignored "
-                "because end_number can not be determined."
-            )
-
         if end_number == math.inf and end_time is None:
             logger.warning(
                 "No end number or end time set, "
@@ -342,7 +351,9 @@ def _run_setup(
             processes=len(processes),
             strategy=scheduling_strategy,
             max_error_rate=max_error_rate,
-            last_requests_statuses = collections.deque(maxlen=settings.constant_error_check_window_size) if max_error_rate > 1 else None
+            last_requests_statuses=collections.deque(
+                maxlen=settings.error_check_window_size
+            )
         )

         return info, requests_iter, times_iter
@@ -459,9 +470,11 @@ def _check_result_ready(
         if is_errored:
             run_info.errored_requests += 1

-        if run_info.last_requests_statuses:
-            status = "error" if is_errored else "success"
-            run_info.last_requests_statuses.append(status)
+        request_status: Literal["error", "success"] = cast(
+            Literal["error", "success"],
+            "error" if is_errored else "success"
+        )
+        run_info.last_requests_statuses.append(request_status)

         return SchedulerRequestResult(
             type_="request_complete",
             run_info=run_info,
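
Aside: patch 33 is the heart of the feature, so the decision logic deserves a plain-Python restatement. A minimal sketch of two of the cases (absolute count and sliding window), with illustrative names that are assumptions for the example rather than the series' API:

from collections import deque

def max_error_reached(statuses: deque, errored_total: int, max_error: float,
                      window_size: int = 10) -> bool:
    if max_error >= 1:
        # Absolute count: stop once more than max_error requests errored.
        return errored_total > max_error
    if len(statuses) >= window_size:
        # Ratio over a sliding window of the most recently finished requests.
        window_errors = sum(1 for s in statuses if s == "error")
        return window_errors / len(statuses) > max_error
    return False

window = deque(maxlen=10)
for status in ["success"] * 4 + ["error"] * 6:
    window.append(status)
print(max_error_reached(window, errored_total=6, max_error=0.5))  # True

The third case (a constant rate with a known finite request count) divides errors by the expected total instead, which makes the check more lenient early in the run, since the denominator is the full expected total rather than just the requests finished so far.
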
From 039db66f8401fa1a04d7ad1cbaacd62a6a18ccc7 Mon Sep 17 00:00:00 2001
From: mark-vaykhansky
Date: Thu, 5 Jun 2025 11:54:14 +0300
Subject: [PATCH 34/37] style + type fixes

---
 src/guidellm/backend/openai.py      |  14 +--
 src/guidellm/objects/pydantic.py    |  13 ++-
 src/guidellm/scheduler/repro.py     | 138 -----------------------
 src/guidellm/scheduler/result.py    |   3 +-
 src/guidellm/scheduler/scheduler.py |  35 +++---
 src/guidellm/scheduler/worker.py    | 165 +++++++++++++++-------------
 6 files changed, 117 insertions(+), 251 deletions(-)
 delete mode 100644 src/guidellm/scheduler/repro.py

diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py
index 5aec53fa..5c416e67 100644
--- a/src/guidellm/backend/openai.py
+++ b/src/guidellm/backend/openai.py
@@ -93,7 +93,7 @@ def __init__(
             raise ValueError("Target URL must be provided for OpenAI HTTP backend.")

         if self._target.endswith("/v1") or self._target.endswith("/v1/"):
-            # backwards compatability, strip v1 off
+            # backwards compatibility, strip v1 off
             self._target = self._target[:-3]

         if self._target.endswith("/"):
@@ -572,12 +572,12 @@ async def _iterative_completions_request(
         async for line in stream.aiter_lines():
             iter_time = time.time()
-            # logger.debug(
-            #     "{} request: {} recieved iter response line: {}",
-            #     self.__class__.__name__,
-            #     request_id,
-            #     line,
-            # )
+            logger.debug(
+                "{} request: {} received iter response line: {}",
+                self.__class__.__name__,
+                request_id,
+                line,
+            )

             if not line or not line.strip().startswith("data:"):
                 continue
diff --git a/src/guidellm/objects/pydantic.py b/src/guidellm/objects/pydantic.py
index 92658e17..3936d690 100644
--- a/src/guidellm/objects/pydantic.py
+++ b/src/guidellm/objects/pydantic.py
@@ -1,10 +1,11 @@
 from typing import Any, Generic, TypeVar

-from loguru import logger
 from pydantic import BaseModel, ConfigDict, Field

 __all__ = ["StandardBaseModel", "StatusBreakdown"]

+from guidellm import logger
+

 class StandardBaseModel(BaseModel):
     """
@@ -21,11 +22,11 @@ class StandardBaseModel(BaseModel):

     def __init__(self, /, **data: Any) -> None:
         super().__init__(**data)
-        # logger.debug(
-        #     "Initialized new instance of {} with data: {}",
-        #     self.__class__.__name__,
-        #     data,
-        # )
+        logger.debug(
+            "Initialized new instance of {} with data: {}",
+            self.__class__.__name__,
+            data,
+        )


 SuccessfulT = TypeVar("SuccessfulT")
diff --git a/src/guidellm/scheduler/repro.py b/src/guidellm/scheduler/repro.py
deleted file mode 100644
index f9f76830..00000000
--- a/src/guidellm/scheduler/repro.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import asyncio
-import multiprocessing
-import time
-import logging
-import threading
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s',
-    datefmt='%H:%M:%S'
-)
-
-# A multiprocessing queue that will remain empty
-# Naming it mp_queue to distinguish from asyncio.Queue
-mp_queue = multiprocessing.Queue()
-
-
-async def get_item_from_mp_queue(q: multiprocessing.Queue, worker_id: int):
-    """
-    Coroutine that tries to get an item from a multiprocessing.Queue
-    using asyncio.to_thread.
-    """
-    logging.info(f"Worker {worker_id}: get_item_from_mp_queue: ENTERED. Awaiting asyncio.to_thread(q.get).")
-    try:
-        # This is the blocking call in a separate thread
-        item = await asyncio.to_thread(q.get)
-        # We don't expect this to be reached if the queue is empty
-        logging.info(
-            f"Worker {worker_id}: get_item_from_mp_queue: asyncio.to_thread RETURNED NORMALLY with item: {item}.")
-        return item
-    except asyncio.CancelledError:
-        # This is where it SHOULD go if the task awaiting this coroutine is cancelled,
-        # and asyncio.to_thread correctly propagates the cancellation to its awaiter.
-        logging.error(
-            f"Worker {worker_id}: get_item_from_mp_queue: CAUGHT CancelledError from asyncio.to_thread directly!")
-        raise  # Re-raise to propagate the cancellation
-    except Exception as e:
-        logging.error(f"Worker {worker_id}: get_item_from_mp_queue: CAUGHT an UNEXPECTED EXCEPTION {type(e)}: {e}",
-                      exc_info=True)
-        raise
-    finally:
-        # This finally block will execute. The key is whether the CancelledError was caught above.
-        logging.info(f"Worker {worker_id}: get_item_from_mp_queue: EXITED (finally block).")
-
-
-async def worker_coroutine(worker_id: int, q: multiprocessing.Queue):
-    """
-    The main coroutine for our worker task. It will try to get an item
-    from the queue.
-    """
-    logging.info(f"Worker {worker_id}: worker_coroutine: STARTED.")
-    try:
-        logging.info(f"Worker {worker_id}: worker_coroutine: About to await get_item_from_mp_queue.")
-        # This is the await point where CancelledError should be injected
-        # if this worker_coroutine task is cancelled.
-        await get_item_from_mp_queue(q, worker_id)
-        logging.info(f"Worker {worker_id}: worker_coroutine: get_item_from_mp_queue completed (unexpectedly).")
-    except asyncio.CancelledError:
-        logging.error(f"Worker {worker_id}: worker_coroutine: SUCCESSFULLY CAUGHT CancelledError.")
-        # Perform any task-specific cleanup here if needed
-    except Exception as e:
-        logging.error(f"Worker {worker_id}: worker_coroutine: CAUGHT UNEXPECTED EXCEPTION {type(e)}: {e}",
-                      exc_info=True)
-    finally:
-        logging.info(f"Worker {worker_id}: worker_coroutine: FINISHED (finally block).")
-
-
-async def main_orchestrator():
-    """
-    Orchestrates the test: creates, runs, and cancels the worker.
-    """
-    logging.info("Main Orchestrator: Starting worker task.")
-    worker_task = asyncio.create_task(worker_coroutine(1, mp_queue), name="WorkerCoroutine-1")
-
-    # Give the worker task a moment to start and block on the queue
-    logging.info("Main Orchestrator: Sleeping for 1 second to let worker block...")
-    await asyncio.sleep(1)
-
-    logging.info(f"Main Orchestrator: Current active threads: {[t.name for t_ in threading.enumerate()]}...")
-
-    # Cancel the worker task
-    print("Main Orchestrator: Cancelling worker_task...")
-    worker_task.cancel()
-
-    # Wait for the worker task to finish, with a timeout.
-    # If cancellation works as expected, worker_task should complete (by handling CancelledError)
-    # well before the timeout.
-    # If it gets stuck, asyncio.TimeoutError will be raised.
-    timeout_seconds = 5.0
-    logging.info(f"Main Orchestrator: Awaiting worker_task with timeout {timeout_seconds}s...")
-    try:
-        await asyncio.wait_for(worker_task, timeout=timeout_seconds)
-        logging.info("Main Orchestrator: worker_task completed WITHOUT timeout.")
-    except asyncio.TimeoutError:
-        logging.error(
-            f"Main Orchestrator: TIMEOUT! worker_task did not finish within {timeout_seconds}s after cancellation.")
-        logging.error(
-            f"Main Orchestrator: worker_task.done() = {worker_task.done()}, worker_task.cancelled() = {worker_task.cancelled()}")
-        # At this point, the thread running mp_queue.get() is likely still blocked.
-    except asyncio.CancelledError:
-        # This would happen if main_orchestrator itself was cancelled, not expected here.
-        logging.error("Main Orchestrator: main_orchestrator itself was cancelled (unexpected).")
-    except Exception as e:
-        logging.error(f"Main Orchestrator: An unexpected error occurred while waiting for worker_task: {e}",
-                      exc_info=True)
-    finally:
-        logging.info("Main Orchestrator: Test finished.")
-        # Note: The thread started by asyncio.to_thread for mp_queue.get()
-        # might still be alive and blocked if q.get() wasn't unblocked.
-        # It's a daemon thread by default, so it won't prevent program exit.
-        # To clean it up, one would typically put a sentinel into mp_queue.
-        # For this test, we are focused on the asyncio task cancellation.
-        logging.info(
-            f"Main Orchestrator: Final check: worker_task.done() = {worker_task.done()}, worker_task.cancelled() = {worker_task.cancelled()}")
-
-        # Attempt to unblock the queue to allow the thread to exit,
-        # though the test's focus is on the asyncio cancellation.
-        try:
-            mp_queue.put_nowait(None)  # Sentinel
-            logging.info("Main Orchestrator: Put sentinel in mp_queue to unblock thread.")
-        except Exception:
-            logging.warning("Main Orchestrator: Could not put sentinel in mp_queue.")
-
-
-if __name__ == "__main__":
-    # For multiprocessing queues to work correctly, especially on Windows/macOS
-    # with 'spawn' or 'forkserver' start methods, it's good practice
-    # to ensure the queue is created in the main process scope before tasks.
-    # In this simple script, it's fine.
-    try:
-        asyncio.run(main_orchestrator())
-    except KeyboardInterrupt:
-        logging.info("Main Orchestrator: Keyboard interrupt received.")
-    finally:
-        mp_queue.close()
-        mp_queue.join_thread()  # Ensure queue's feeder thread is joined
-        logging.info("Main Orchestrator: mp_queue resources released.")
diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py
index 4bf15971..5cbf2f7e 100644
--- a/src/guidellm/scheduler/result.py
+++ b/src/guidellm/scheduler/result.py
@@ -50,6 +50,7 @@ class SchedulerRunInfo(StandardBaseModel):
     end_number: float
     processes: int
     strategy: SchedulingStrategy
+    last_requests_statuses: deque[RequestStatus]
     max_error_rate: Optional[float] = None

     created_requests: int = 0
@@ -59,8 +60,6 @@ class SchedulerRunInfo(StandardBaseModel):
     completed_requests: int = 0
     errored_requests: int = 0

-    last_requests_statuses: Optional[deque[RequestStatus]] = None
-

 class SchedulerRequestInfo(StandardBaseModel):
     """
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index f6129d14..4e5bca3a 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -1,6 +1,5 @@
-import collections
-from datetime import timedelta
 import asyncio
+import collections
 import math
 import multiprocessing
 import multiprocessing.queues
@@ -11,8 +10,10 @@
 from typing import (
     Any,
     Generic,
+    Literal,
     Optional,
-    Union, Literal, cast,
+    Union,
+    cast,
 )

 from loguru import logger
@@ -128,9 +129,7 @@ async def run(
                 requests_queue,
                 responses_queue,
                 shutdown_event,
-            ) = await self._start_processes(
-                manager, executor, scheduling_strategy
-            )
+            ) = await self._start_processes(manager, executor, scheduling_strategy)
             if shutdown_event.is_set():
                 raise RuntimeError("shutdown_event is set before starting scheduling")

@@ -156,7 +155,6 @@ async def run(
                 ):
                     # we've exhausted all requests we've wanted to run
                     # and yielded all responses
-                    logger.info("run_info.completed_requests >= run_info.created_requests")
                     break

                 requests_iter = self._add_requests(
@@ -229,10 +227,7 @@ def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
                 f"{max_error} (max error)"
             )
             return max_error < run_info.errored_requests
-        elif(
-            run_info.strategy.type_ == "constant"
-            and run_info.end_number != math.inf
-        ):
+        elif run_info.strategy.type_ == "constant" and run_info.end_number != math.inf:
             current_error_ratio = run_info.errored_requests / run_info.end_number
             logger.debug(
                 f"Current error rate {current_error_ratio} "
@@ -241,13 +236,12 @@ def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool:
             return max_error < current_error_ratio
         elif settings.error_check_window_size <= run_info.completed_requests:
             last_requests_statuses = run_info.last_requests_statuses
-            last_errored_requests_count = len([
-                s
-                for s
-                in last_requests_statuses
-                if s == "error"
-            ])
-            current_error_ratio = last_errored_requests_count / len(last_requests_statuses)
+            last_errored_requests_count = len(
+                [s for s in last_requests_statuses if s == "error"]
+            )
+            current_error_ratio = last_errored_requests_count / len(
+                last_requests_statuses
+            )
             logger.debug(
                 f"Current error rate in "
                 f"last requests window is "
@@ -353,7 +347,7 @@ def _run_setup(
             max_error_rate=max_error_rate,
             last_requests_statuses=collections.deque(
                 maxlen=settings.error_check_window_size
-            )
+            ),
         )

         return info, requests_iter, times_iter
@@ -471,8 +465,7 @@ def _check_result_ready(
             run_info.errored_requests += 1

         request_status: Literal["error", "success"] = cast(
-            Literal["error", "success"],
-            "error" if is_errored else "success"
+            "Literal['error', 'success']", "error" if is_errored else "success"
         )
         run_info.last_requests_statuses.append(request_status)
diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py
index ce875409..784d4c21 100644
--- a/src/guidellm/scheduler/worker.py
+++ b/src/guidellm/scheduler/worker.py
@@ -2,12 +2,14 @@
 import math
 import multiprocessing.queues
 import queue
+import threading
 import time
+import typing
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from datetime import timedelta
 from multiprocessing.synchronize import Event as MultiprocessingEvent
+from threading import Event
 from typing import (
     Any,
     Generic,
@@ -42,7 +44,7 @@
 ]


-class ShutdownSignalReceived(Exception):
+class ShutdownSignalReceivedError(Exception):
     pass


@@ -127,11 +129,12 @@ async def resolve(
         ...

     async def get_request(
-        self, requests_queue: multiprocessing.Queue,
-        shutdown_event: MultiprocessingEvent,
-        process_id: int,
-        shutdown_poll_interval_seconds: float,
-    ) -> Optional[WorkerProcessRequest[RequestT]]:
+        self,
+        requests_queue: multiprocessing.Queue,
+        shutdown_event: threading.Event,
+        process_id: int,
+        shutdown_poll_interval_seconds: float,
+    ) -> WorkerProcessRequest[RequestT]:
         # We need to check shutdown_event intermittently because
         # if we simply use asyncio.to_thread(requests_queue.get)
         # the cancellation task doesn't propagate because the
         # asyncio.to_thread is blocking
         def _get_queue_intermittently():
             while True:
                 try:
                     return requests_queue.get(timeout=shutdown_poll_interval_seconds)
-                except queue.Empty:
+                except queue.Empty as e:
                     logger.info("Checking shutdown event is set in get_request")
                     if shutdown_event.is_set():
                         logger.info(f"Shutdown signal received in future {process_id}")
-                        raise asyncio.CancelledError()
+                        raise asyncio.CancelledError from e
+
         return await asyncio.to_thread(_get_queue_intermittently)  # type: ignore[attr-defined]

     async def send_result(
         self,
@@ -212,45 +216,56 @@ async def resolve_scheduler_request(
         asyncio.create_task(self.send_result(results_queue, result))

     def run_process(
-            self,
-            type_: Literal["sync", "async"],
-            requests_queue: multiprocessing.Queue,
-            results_queue: multiprocessing.Queue,
-            shutdown_event: multiprocessing.Event,
-            shutdown_poll_interval_seconds: float,
-            process_id: int,
-            max_concurrency: Optional[int] = None,
+        self,
+        type_: Literal["sync", "async"],
+        requests_queue: multiprocessing.Queue,
+        results_queue: multiprocessing.Queue,
+        shutdown_event: MultiprocessingEvent,
+        shutdown_poll_interval_seconds: float,
+        process_id: int,
+        max_concurrency: Optional[int] = None,
     ):
         async def _process_runner():
-            import threading
-            internal_shutdown_event = threading.Event()
+            # We are using a separate internal event
+            # because if we're using the shutdown_event
+            # there's a race condition between the get_request
+            # loop which checks for shutdown and the .cancel() in this
+            # method which causes the asyncio.CancelledError
+            # to propagate and crash the worker
+            internal_shutdown_event: threading.Event = Event()
             if type_ == "sync":
-                loop_task = asyncio.create_task(self._process_synchronous_requests_loop(
-                    requests_queue=requests_queue,
-                    results_queue=results_queue,
-                    process_id=process_id,
-                    shutdown_event=internal_shutdown_event,
-                    shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
-                ), name="request_loop_processor_task")
+                loop_task = asyncio.create_task(
+                    self._process_synchronous_requests_loop(
+                        requests_queue=requests_queue,
+                        results_queue=results_queue,
+                        process_id=process_id,
+                        shutdown_event=internal_shutdown_event,
+                        shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
+                    ),
+                    name="request_loop_processor_task",
+                )
             elif type_ == "async":
                 if max_concurrency is None:
-                    raise ValueError("max_concurrency must be set "
-                                     "for async processor")
-                loop_task = asyncio.create_task(self._process_asynchronous_requests_loop(
-                    requests_queue=requests_queue,
-                    results_queue=results_queue,
-                    max_concurrency=max_concurrency,
-                    process_id=process_id,
-                    shutdown_event=internal_shutdown_event,
-                    shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
-                ), name="request_loop_processor_task")
+                    raise ValueError("max_concurrency must be set for async processor")
+                loop_task = asyncio.create_task(
+                    self._process_asynchronous_requests_loop(
+                        requests_queue=requests_queue,
+                        results_queue=results_queue,
+                        max_concurrency=max_concurrency,
+                        process_id=process_id,
+                        shutdown_event=internal_shutdown_event,
+                        shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
+                    ),
+                    name="request_loop_processor_task",
+                )
             else:
                 raise ValueError(f"Invalid process type: {type_}")

             shutdown_task = asyncio.create_task(
                 self._wait_for_shutdown(
                     shutdown_event=shutdown_event,
-                    shutdown_poll_interval=shutdown_poll_interval_seconds
+                    shutdown_poll_interval=shutdown_poll_interval_seconds,
+                    process_id=process_id,
                 ),
                 name="shutdown_task",
             )
@@ -262,22 +277,26 @@ async def _process_runner():
                 ],
                 return_when=asyncio.FIRST_EXCEPTION,
             )
-            logger.info(f"First exception happened, done: [{[r.get_name() for r in done]}]")
+            logger.info(
+                f"First exception happened, done: [{[r.get_name() for r in done]}]"
+            )

             for task in pending:
-                logger.debug(f"Cancelling task {task.get_name()}")
-                cancel_result = task.cancel()
+                logger.debug(
+                    f"Cancelling task {task.get_name()}|| Process {process_id}"
+                )
+                task.cancel()
                 internal_shutdown_event.set()
-                logger.debug(f"{'Task is already done or canceled' if not cancel_result else 'sent cancel signal'}")
-                try:
+                try:  # noqa: SIM105
                     await task
                 except asyncio.CancelledError:
                     pass

             for task in done:
-                task_exception = task.exception()
-                if not isinstance(task_exception, ShutdownSignalReceived):
+                task_exception = typing.cast("Exception", task.exception())
+                if not isinstance(task_exception, ShutdownSignalReceivedError):
                     raise task_exception

         try:
             asyncio.run(_process_runner())
         except Exception as exc:  # noqa: BLE001
@@ -290,32 +309,35 @@ async def _process_runner():
             shutdown_event.set()  # ensure shutdown event is set to stop other processes

     async def _wait_for_shutdown(
-            self,
-            shutdown_event: MultiprocessingEvent,
-            shutdown_poll_interval: float,
+        self,
+        shutdown_event: MultiprocessingEvent,
+        shutdown_poll_interval: float,
+        process_id: int,
     ):
-        while not shutdown_event.is_set():
+        while not shutdown_event.is_set():  # noqa: ASYNC110
            await asyncio.sleep(shutdown_poll_interval)

         # Raising asyncio.CancelledError instead would
         # cause the asyncio.wait above to wait
         # forever; we couldn't find a clear reason why
-        raise ShutdownSignalReceived("Shutdown event set, cancelling process loop.")
+        raise ShutdownSignalReceivedError(
+            f"Shutdown event set for process {process_id}, cancelling process loop."
+        )

     async def _process_synchronous_requests_loop(
-            self,
-            requests_queue: multiprocessing.Queue,
-            results_queue: multiprocessing.Queue,
-            process_id: int,
-            shutdown_event: MultiprocessingEvent,
-            shutdown_poll_interval_seconds: float,
+        self,
+        requests_queue: multiprocessing.Queue,
+        results_queue: multiprocessing.Queue,
+        process_id: int,
+        shutdown_event: threading.Event,
+        shutdown_poll_interval_seconds: float,
     ):
         while True:
             process_request = await self.get_request(
                 requests_queue=requests_queue,
                 shutdown_event=shutdown_event,
                 process_id=process_id,
-                shutdown_poll_interval_seconds=shutdown_poll_interval_seconds
+                shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
             )
             dequeued_time = time.time()
@@ -336,7 +358,7 @@ async def _process_asynchronous_requests_loop(
         results_queue: multiprocessing.Queue,
         max_concurrency: int,
         process_id: int,
-        shutdown_event: MultiprocessingEvent,
+        shutdown_event: threading.Event,
         shutdown_poll_interval_seconds: float,
     ):
         pending = asyncio.Semaphore(max_concurrency)
@@ -431,36 +453,25 @@ async def prepare_multiprocessing(self):
         """
         await self.backend.prepare_multiprocessing()

-    def process_loop_synchronous(
+    def run_process(
         self,
+        type_: Literal["sync", "async"],
         requests_queue: multiprocessing.Queue,
         results_queue: multiprocessing.Queue,
+        shutdown_event: MultiprocessingEvent,
+        shutdown_poll_interval_seconds: float,
         process_id: int,
-        shutdown_event: Optional[MultiprocessingEvent] = None,
+        max_concurrency: Optional[int] = None,
     ):
         asyncio.run(self.backend.validate())
-        super().process_loop_synchronous(
+        super().run_process(
+            type_=type_,
             requests_queue=requests_queue,
             results_queue=results_queue,
-            process_id=process_id,
             shutdown_event=shutdown_event,
-        )
-
-    def process_loop_asynchronous(
-        self,
-        requests_queue: multiprocessing.Queue,
-        results_queue: multiprocessing.Queue,
-        max_concurrency: int,
-        process_id: int,
-        shutdown_event: Optional[MultiprocessingEvent] = None,
-    ):
-        asyncio.run(self.backend.validate())
-        super().process_loop_asynchronous(
-            requests_queue=requests_queue,
-            results_queue=results_queue,
-            max_concurrency=max_concurrency,
+            shutdown_poll_interval_seconds=shutdown_poll_interval_seconds,
             process_id=process_id,
-            shutdown_event=shutdown_event,
+            max_concurrency=max_concurrency,
         )

     async def resolve(
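
Aside: patch 34's race-condition comment hinges on the two event scopes in play. A short illustrative sketch of the distinction (names are assumptions for the example):

import multiprocessing
import threading

if __name__ == "__main__":
    manager = multiprocessing.Manager()
    cross_process_shutdown = manager.Event()    # shared scheduler -> worker processes
    process_local_shutdown = threading.Event()  # visible only within one worker process
    cross_process_shutdown.set()
    print(cross_process_shutdown.is_set(), process_local_shutdown.is_set())

The worker's request loop polls the process-local event, which only the worker's own runner sets after it has already decided to cancel; polling the shared multiprocessing event directly could race the runner's cancel() and surface a stray asyncio.CancelledError — the crash the comment describes.
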
" - "Applicable only for finite deterministic scenarios i.e " - "rate_type is 'constant' and 'max_seconds' exists OR " - "'max_requests' exists OR the dataset is finite. " - "If None or not applicable, benchmarks will continue regardless of error rate." + "The maximum error after which a benchmark will stop. " + "Can either be a rate i.e 0 < rate < 1 or constant number. " + "If rate is given and rate_type is 'constant' and 'max_seconds' exists " + "then the rate will be calculated as part of the total expected " + "requests count i.e rate * duration. If rate is given and number" + "of requests is not pre-determined than a context window " + "of the last requests will be looked at. Context window size" + "is configurable under GUIDELLM__ERROR_CHECK_WINDOW_SIZE." + "If a number above 1 is given than we just count the total" + "number of error and check if it's above the threshold." ), ) @click.option( @@ -253,7 +258,7 @@ def benchmark( rate, max_seconds, max_requests, - max_error_rate, + max_error, warmup_percent, cooldown_percent, disable_progress, @@ -279,7 +284,7 @@ def benchmark( rate=rate, max_seconds=max_seconds, max_requests=max_requests, - max_error_rate=max_error_rate, + max_error=max_error, warmup_percent=warmup_percent, cooldown_percent=cooldown_percent, show_progress=not disable_progress, diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index d33e6a56..c2d8c011 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -90,8 +90,8 @@ class BenchmarkArgs(StandardBaseModel): max_duration: Optional[float] = Field( description="The maximum duration in seconds to run this benchmark, if any." ) - max_error_rate: Optional[float] = Field( - description="Maximum error rate after which a benchmark will stop." + max_error: Optional[float] = Field( + description="Maximum error rate or const after which a benchmark will stop." ) warmup_number: Optional[int] = Field( description=( @@ -220,7 +220,7 @@ class BenchmarkRunStats(StandardBaseModel): description=( "The number of errored requests divided by the number " "of successful and errored requests. " - "This can be higher than max_error_rate " + "This can be higher than max_error " "(if applicable) cause it does not take into " "account incomplete requests." 
) diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index dddcadb5..7a9f41ee 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -74,8 +74,10 @@ class BenchmarkerStrategyLimits(StandardBaseModel): description="Maximum duration (in seconds) to process requests per strategy.", ge=0, ) - max_error_rate: Optional[float] = Field( - description="Maximum error rate after which a benchmark will stop", + max_error: Optional[float] = Field( + description="Maximum error after which a " + "benchmark will stop," + " either rate or fixed number", ge=0, ) warmup_percent_per_strategy: Optional[float] = Field( @@ -152,7 +154,7 @@ async def run( profile: Profile, max_number_per_strategy: Optional[int], max_duration_per_strategy: Optional[float], - max_error_rate: Optional[float], + max_error: Optional[float], warmup_percent_per_strategy: Optional[float], cooldown_percent_per_strategy: Optional[float], ) -> AsyncGenerator[ @@ -167,7 +169,7 @@ async def run( requests_loader_size=requests_loader_size, max_number_per_strategy=max_number_per_strategy, max_duration_per_strategy=max_duration_per_strategy, - max_error_rate=max_error_rate, + max_error=max_error, warmup_percent_per_strategy=warmup_percent_per_strategy, cooldown_percent_per_strategy=cooldown_percent_per_strategy, ) @@ -202,7 +204,7 @@ async def run( scheduling_strategy=scheduling_strategy, max_number=max_number_per_strategy, max_duration=max_duration_per_strategy, - max_error_rate=max_error_rate, + max_error=max_error, ): if result.type_ == "run_start": yield BenchmarkerResult( @@ -328,7 +330,7 @@ def create_benchmark_aggregator( strategy=strategy, max_number=limits.max_number, max_duration=limits.max_duration, - max_error_rate=limits.max_error_rate, + max_error=limits.max_error, warmup_number=limits.warmup_number, warmup_duration=limits.warmup_duration, cooldown_number=limits.cooldown_number, diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index a5e4da3b..e70ae0a6 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -41,7 +41,7 @@ async def benchmark_generative_text( rate: Optional[Union[float, list[float]]], max_seconds: Optional[float], max_requests: Optional[int], - max_error_rate: Optional[float], + max_error: Optional[float], warmup_percent: Optional[float], cooldown_percent: Optional[float], show_progress: bool, @@ -108,7 +108,7 @@ async def benchmark_generative_text( profile=profile, max_number_per_strategy=max_requests, max_duration_per_strategy=max_seconds, - max_error_rate=max_error_rate, + max_error=max_error, warmup_percent_per_strategy=warmup_percent, cooldown_percent_per_strategy=cooldown_percent, ): diff --git a/src/guidellm/benchmark/output.py b/src/guidellm/benchmark/output.py index 33b1efc2..ac32bc4f 100644 --- a/src/guidellm/benchmark/output.py +++ b/src/guidellm/benchmark/output.py @@ -419,7 +419,7 @@ def benchmarks_args_str(self) -> str: { "max_number": args.max_number, "max_duration": args.max_duration, - "max_error_rate": args.max_error_rate, + "max_error": args.max_error, "warmup_number": args.warmup_number, "warmup_duration": args.warmup_duration, "cooldown_number": args.cooldown_number, diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 5cbf2f7e..4f4d5c87 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -51,7 +51,7 @@ class SchedulerRunInfo(StandardBaseModel): processes: int 
strategy: SchedulingStrategy last_requests_statuses: deque[RequestStatus] - max_error_rate: Optional[float] = None + max_error: Optional[float] = None created_requests: int = 0 queued_requests: int = 0 diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 4e5bca3a..e84c2320 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -76,7 +76,7 @@ async def run( scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, - max_error_rate: Optional[float] = None, + max_error: Optional[float] = None, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -105,8 +105,8 @@ async def run( :param max_duration: The maximum duration for the scheduling run. If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. - :param max_error_rate: The maximum error rate after which the - scheduler shuts down. + :param max_error: The maximum error rate or const + after which the scheduler shuts down. Only applicable in benchmarks with finite deterministic number of requests. If None or not applicable then scheduler will continue regardless of errors. :return: An asynchronous generator that yields SchedulerResult objects. @@ -114,7 +114,7 @@ async def run( the response, and the run information. """ self._validate_scheduler_params( - scheduling_strategy, max_duration, max_error_rate, max_number + scheduling_strategy, max_duration, max_error, max_number ) with ( @@ -134,7 +134,7 @@ async def run( raise RuntimeError("shutdown_event is set before starting scheduling") run_info, requests_iter, times_iter = self._run_setup( - futures, scheduling_strategy, max_number, max_duration, max_error_rate + futures, scheduling_strategy, max_number, max_duration, max_error ) yield SchedulerResult( type_="run_start", @@ -142,8 +142,8 @@ async def run( ) try: - max_error_rate_reached = False - while not max_error_rate_reached: + max_error_reached = False + while not max_error_reached: # check errors and raise them for future in futures: if future.done() and (err := future.exception()) is not None: @@ -173,13 +173,13 @@ async def run( if ( iter_result.request_info.errored and not iter_result.request_info.canceled - and self._is_max_error_rate_reached(iter_result.run_info) + and self._is_max_error_reached(iter_result.run_info) ): shutdown_event.set() - max_error_rate_reached = True + max_error_reached = True logger.info( f"Max error rate of " - f"({iter_result.run_info.max_error_rate}) " + f"({iter_result.run_info.max_error}) " f"reached, sending shutdown signal" ) yield iter_result @@ -200,7 +200,7 @@ def _validate_scheduler_params( self, scheduling_strategy: SchedulingStrategy, max_duration: Optional[float], - max_error_rate: Optional[float], + max_error: Optional[float], max_number: Optional[int], ) -> None: if scheduling_strategy is None or not isinstance( @@ -211,11 +211,11 @@ def _validate_scheduler_params( raise ValueError(f"Invalid max_number: {max_number}") if max_duration is not None and max_duration < 0: raise ValueError(f"Invalid max_duration: {max_duration}") - if max_error_rate is not None and (max_error_rate < 0): - raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + if max_error is not None and (max_error < 0): + raise ValueError(f"Invalid max_error: {max_error}") - def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: - max_error = run_info.max_error_rate + def 
_is_max_error_reached(self, run_info: SchedulerRunInfo) -> bool: + max_error = run_info.max_error if max_error is None: return False @@ -322,7 +322,7 @@ def _run_setup( scheduling_strategy: SchedulingStrategy, max_number: Optional[int], max_duration: Optional[float], - max_error_rate: Optional[float], + max_error: Optional[float], ) -> tuple[SchedulerRunInfo, Iterator[Any], Iterator[float]]: requests_iter = iter(self.request_loader) start_time = time.time() @@ -344,7 +344,7 @@ def _run_setup( end_number=end_number, processes=len(processes), strategy=scheduling_strategy, - max_error_rate=max_error_rate, + max_error=max_error, last_requests_statuses=collections.deque( maxlen=settings.error_check_window_size ), From 5783d629efcf7591f48136472965f89f6478a1b2 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 5 Jun 2025 14:32:34 +0300 Subject: [PATCH 36/37] fix tests --- tests/unit/benchmark/test_output.py | 2 +- tests/unit/mock_benchmark.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/benchmark/test_output.py b/tests/unit/benchmark/test_output.py index e3114491..de32b44b 100644 --- a/tests/unit/benchmark/test_output.py +++ b/tests/unit/benchmark/test_output.py @@ -113,7 +113,7 @@ def test_console_benchmarks_args_str(): mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_args_str == ( - "max_number=None, max_duration=10.0, max_error_rate=0.05, warmup_number=None, " + "max_number=None, max_duration=10.0, max_error=0.05, warmup_number=None, " "warmup_duration=None, cooldown_number=None, cooldown_duration=None" ) diff --git a/tests/unit/mock_benchmark.py b/tests/unit/mock_benchmark.py index 3c360c68..4a8a1f29 100644 --- a/tests/unit/mock_benchmark.py +++ b/tests/unit/mock_benchmark.py @@ -221,7 +221,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: strategy=SynchronousStrategy(), max_number=None, max_duration=10.0, - max_error_rate=0.05, + max_error=0.05, warmup_number=None, warmup_duration=None, cooldown_number=None, From 85cb24d904e844ce3c7a25a78fb8a47f9858c8f3 Mon Sep 17 00:00:00 2001 From: mark-vaykhansky Date: Thu, 5 Jun 2025 14:33:56 +0300 Subject: [PATCH 37/37] invert if --- src/guidellm/scheduler/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index e84c2320..31ac5c61 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -219,7 +219,7 @@ def _is_max_error_reached(self, run_info: SchedulerRunInfo) -> bool: if max_error is None: return False - if not max_error < 1: + if max_error >= 1: # Absolute error count, i.e not a ratio logger.debug( f"Current error count "