Commit e7dc638

Fix minor conflicts
Signed-off-by: Thomas Parnell <[email protected]>
2 parents: bf931dd + 310468c

208 files changed: +6795 -4381 lines

.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh (+107)
@@ -301,6 +301,104 @@ run_serving_tests() {
   kill_gpu_processes
 }

+run_genai_perf_tests() {
+  # run genai-perf tests
+
+  # $1: a json file specifying genai-perf test cases
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests
+  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+      backend=$CURRENT_LLM_SERVING_ENGINE
+
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+      #TODO: add output dir.
+      client_command="genai-perf profile \
+        -m $model \
+        --service-kind openai \
+        --backend vllm \
+        --endpoint-type chat \
+        --streaming \
+        --url localhost:$port \
+        --request-rate $qps \
+        --num-prompts $num_prompts \
+        "
+
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      #TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+
+}

 prepare_dataset() {
@@ -328,12 +426,17 @@ main() {

   pip install -U transformers

+  pip install -r requirements-dev.txt
+  which genai-perf
+
   # check storage
   df -h

   ensure_installed wget
   ensure_installed curl
   ensure_installed jq
+  # genai-perf dependency
+  ensure_installed libb64-0d

   prepare_dataset

@@ -345,6 +448,10 @@ main() {
   # run the test
   run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

+  # run genai-perf tests
+  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+  mv artifacts/ $RESULTS_FOLDER/
+
   # upload benchmark results to buildkite
   python3 -m pip install tabulate pandas
   python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
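
A note on the qps_list handling in run_genai_perf_tests(): jq's @sh filter leaves numbers bare but wraps strings in single quotes, so a configured rate of "inf" arrives as 'inf'; that is why the loop matches *"inf"* and substitutes $num_prompts. A standalone illustration with a hypothetical list, not part of the script:

echo '[4, 8, "inf"]' | jq -r '.[] | @sh'
# prints:
# 4
# 8
# 'inf'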
New file (path not shown in this view; referenced above as "$BENCHMARK_ROOT/tests/genai-perf-tests.json") (+23)

@@ -0,0 +1,23 @@
+[
+  {
+    "test_name": "llama8B_tp1_genai_perf",
+    "qps_list": [4,8,16,32],
+    "common_parameters": {
+      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+      "tp": 1,
+      "port": 8000,
+      "num_prompts": 500,
+      "reuse_server": false
+    },
+    "vllm_server_parameters": {
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "gpu_memory_utilization": 0.9,
+      "num_scheduler_steps": 10,
+      "max_num_seqs": 512,
+      "dtype": "bfloat16"
+    },
+    "genai_perf_input_parameters": {
+    }
+  }
+]
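
Putting the two pieces together: for the single entry above and the first qps value (4), the loop in run_genai_perf_tests() assembles roughly the following client command (a sketch derived from the config values shown here, not captured from an actual run; it assumes a vLLM server is already listening on the configured port):

genai-perf profile \
  -m meta-llama/Meta-Llama-3-8B-Instruct \
  --service-kind openai \
  --backend vllm \
  --endpoint-type chat \
  --streaming \
  --url localhost:8000 \
  --request-rate 4 \
  --num-prompts 500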

.buildkite/run-cpu-test.sh (+2 -2)

@@ -83,6 +83,6 @@ function cpu_tests() {
     tests/lora/test_qwen2vl.py"
 }

-# All of CPU tests are expected to be finished less than 25 mins.
+# All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/run-hpu-test.sh (+10 -2)

@@ -8,9 +8,17 @@ set -ex
 docker build -t hpu-test-env -f Dockerfile.hpu .

 # Setup cleanup
+# certain versions of HPU software stack have a bug that can
+# override the exit code of the script, so we need to use
+# separate remove_docker_container and remove_docker_container_and_exit
+# functions, while other platforms only need one remove_docker_container
+# function.
+EXITCODE=1
 remove_docker_container() { docker rm -f hpu-test || true; }
-trap remove_docker_container EXIT
+remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
+trap remove_docker_container_and_exit EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
+EXITCODE=$?
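
The same cleanup pattern in isolation (a minimal sketch of the trap-based approach used above, with illustrative container/image/script names; not part of the committed script): the EXIT trap fires on every exit path, removes the container, and re-raises the status captured right after docker run, so cleanup cannot mask the test result.

EXITCODE=1   # pessimistic default: report failure if the test never runs
cleanup() {
  docker rm -f my-test-container || true   # hypothetical container name, for illustration
  exit $EXITCODE                           # re-raise the captured status from the EXIT trap
}
trap cleanup EXIT
docker run --name=my-test-container my-image python3 my_test.py
EXITCODE=$?                                # preserve the container's exit status for the trap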

.buildkite/test-pipeline.yaml (+8 -1)

@@ -107,7 +107,7 @@ steps:
   source_file_dependencies:
   - vllm/
   commands:
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process

@@ -126,11 +126,15 @@ steps:
   - tests/distributed
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile
+  - examples/offline_inference/rlhf.py
   commands:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - python3 ../examples/offline_inference/rlhf.py

 - label: Metrics, Tracing Test # 10min
   num_gpus: 2

@@ -462,7 +466,10 @@ steps:
   - vllm/worker/worker_base.py
   - vllm/worker/worker.py
   - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
   commands:
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'

.github/workflows/test-spyre.yml (+4 -9)

@@ -1,14 +1,6 @@
 name: test-sypre

-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
+on: pull_request

 jobs:
   test-spyre:

@@ -28,5 +20,8 @@ jobs:
         python3.12 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer(\"sentence-transformers/all-roberta-large-v1\")" && \
         export VARIANT=$(ls /root/.cache/huggingface/hub/models--sentence-transformers--all-roberta-large-v1/snapshots/) && \
         ln -s /root/.cache/huggingface/hub/models--sentence-transformers--all-roberta-large-v1/snapshots/${VARIANT} /models/all-roberta-large-v1 && \
+        export MASTER_PORT=12355 && \
+        export MASTER_ADDR=localhost && \
+        export DISTRIBUTED_STRATEGY_IGNORE_MODULES=WordEmbedding && \
         python3.12 -m pytest tests/spyre -v
         '''

Dockerfile.cpu (+3 -3)

@@ -26,20 +26,20 @@ RUN pip install intel_extension_for_pytorch==2.5.0

 WORKDIR /workspace

-COPY requirements-build.txt requirements-build.txt
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
     pip install --upgrade pip && \
     pip install -r requirements-build.txt

 FROM cpu-test-1 AS build

 WORKDIR /workspace/vllm

-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cpu.txt requirements-cpu.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
     pip install -v -r requirements-cpu.txt

 COPY . .

Dockerfile.hpu (+1 -1)

@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest

 COPY ./ /workspace/vllm

README.md (+7 -5)

@@ -42,6 +42,8 @@ We will try to rebase against upstream frequently and we plan to contribute thes
 ## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.

+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
+
 vLLM is fast with:

 - State-of-the-art serving throughput

@@ -76,16 +78,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod

 ## Getting Started

-Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
+Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):

 ```bash
 pip install vllm
 ```

-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
-- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
-- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
-- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
+Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
+- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
+- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
+- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)

 ## Contributing

benchmarks/backend_request_func.py (+24 -3)

@@ -417,14 +417,35 @@ def get_model(pretrained_model_name_or_path: str) -> str:


 def get_tokenizer(
-    pretrained_model_name_or_path: str, trust_remote_code: bool
+    pretrained_model_name_or_path: str,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     if pretrained_model_name_or_path is not None and not os.path.exists(
             pretrained_model_name_or_path):
         pretrained_model_name_or_path = get_model(
             pretrained_model_name_or_path)
-    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
-                                         trust_remote_code=trust_remote_code)
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+    if tokenizer_mode == "mistral":
+        try:
+            from vllm.transformers_utils.tokenizer import MistralTokenizer
+        except ImportError as e:
+            raise ImportError("MistralTokenizer requires vllm package.\n"
+                              "Please install it with `pip install vllm` "
+                              "to use mistral tokenizer mode.") from e
+        return MistralTokenizer.from_pretrained(
+            str(pretrained_model_name_or_path))
+    else:
+        return AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )


 ASYNC_REQUEST_FUNCS = {
