# An example YAML for serving the Code Llama 70B Instruct model from Meta
# behind an OpenAI-compatible API.
#
# Usage:
#   1. Launch on a single instance: `sky launch -c code-llama ./endpoint.yaml`
#   2. Scale up to multiple replicas behind a single endpoint:
#      `sky serve up -n code-llama ./endpoint.yaml`
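#   3. Query the endpoint with the OpenAI completions API (a sketch; replace
#      <endpoint> with the address reported by `sky serve status code-llama`):
#      curl http://<endpoint>/v1/completions \
#        -H "Content-Type: application/json" \
#        -d '{"model": "codellama/CodeLlama-70b-Instruct-hf",
#             "prompt": "def hello_world():", "max_tokens": 64}'
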
service:
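  # The readiness probe POSTs the completion request below to each replica;
  # initial_delay_seconds gives a replica 30 minutes to download and load the
  # 70B weights before failed probes count against it.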
  readiness_probe:
    path: /v1/completions
    post_data:
      model: codellama/CodeLlama-70b-Instruct-hf
      prompt: "def hello_world():"
      max_tokens: 1
    initial_delay_seconds: 1800
  replicas: 2

resources:
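  # Any one of these GPU sets can host the model; SkyPilot falls back across
  # them and, by default, provisions the cheapest offering that is available.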
  accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
  disk_size: 1024
  disk_tier: best
  memory: 32+
  ports: 8000

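# `conda activate` exits non-zero when the env does not exist yet, so the
# first run creates the env and then activates it.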
setup: |
  conda activate codellama
  if [ $? -ne 0 ]; then
    conda create -n codellama python=3.10 -y
    conda activate codellama
  fi

  pip install transformers==4.38.0
  pip install vllm==0.3.2

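# SKYPILOT_NUM_GPUS_PER_NODE is set by SkyPilot to the number of GPUs on the
# node, so --tensor-parallel-size matches whichever accelerator set above was
# actually provisioned.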
run: |
  conda activate codellama
  export PATH=$PATH:/sbin
  # Reduce --max-num-seqs to avoid OOM while loading the model on L4:8.
  python -u -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --model codellama/CodeLlama-70b-Instruct-hf \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --max-num-seqs 64 | tee ~/openai_api_server.log