#!/bin/bash
# Earlier sweeps, kept commented out for reference:
# num_shots=3
# for rm_type in "pairwise" "individual";
# do
# # 128k base models
# python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3.1-8B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# # python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3.1-70B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# python test_on_rewardbench.py --model_name="Qwen/Qwen2-0.5B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# python test_on_rewardbench.py --model_name="Qwen/Qwen2-1.5B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# python test_on_rewardbench.py --model_name="Qwen/Qwen2-7B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# # python test_on_rewardbench.py --model_name="Qwen/Qwen2-57B-A14B" --rm_type="${rm_type}" --num_workers=2 --num_gpu_per_worker=4 --num_shots=${num_shots} --engine="vllm"
# python test_on_rewardbench.py --model_name="Qwen/Qwen2-72B" --rm_type="${rm_type}" --num_workers=1 --num_gpu_per_worker=8 --num_shots=${num_shots} --engine="vllm"
# ## 8K base models
# python test_on_rewardbench.py --model_name="google/gemma-2-2b" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# # python test_on_rewardbench.py --model_name="google/gemma-2-9b" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# # python test_on_rewardbench.py --model_name="google/gemma-2-27b" --rm_type="${rm_type}" --num_workers=2 --num_gpu_per_worker=4 --num_shots=${num_shots} --engine="vllm"
# python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3-8B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# # other
# # python test_on_rewardbench.py --model_name="gpt-4o-mini" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="openai"
# done
# for rm_type in "pairwise" "individual";
# do
# # 128k base models
# python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3.1-70B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# # python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3-70B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# # python test_on_rewardbench.py --model_name="Qwen/Qwen2-72B" --rm_type="${rm_type}" --num_workers=1 --num_gpu_per_worker=8 --num_shots=${num_shots} --engine="vllm"
# python test_on_rewardbench.py --model_name="Qwen/Qwen2-57B-A14B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# done
# for rm_type in "pairwise" "individual";
# do
# for num_shots in 1 3 5 10;
# do
# # 128k base models
# python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3.1-8B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm" --results_dir="results_debug"
# # python test_on_rewardbench.py --model_name="Qwen/Qwen2-7B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm" --results_dir="results_debug"
# # python test_on_rewardbench.py --model_name="hugging-quants/Meta-Llama-3.1-8B-BNB-NF4" --quantization "bitsandbytes" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --engine="vllm"
# done
# done
# Active sweep: pairwise reward-model evaluation at 0/1/3/5 shots for Llama-3.1-8B
# (base model, completion-style prompts) and Llama-3.1-8B-Instruct (chat-style prompts).
for rm_type in "pairwise";
do
for num_shots in 0 1 3 5;
do
# 128k base models
python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3.1-8B" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --completion True --engine="vllm" --results_dir="results_random_completion" --max_tokens 3072
python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3.1-8B-Instruct" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --completion False --engine="vllm" --results_dir="results_random_instruct"
# python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3.1-8B-Instruct" --rm_type="${rm_type}" --num_workers=8 --num_gpu_per_worker=1 --num_shots=${num_shots} --completion True --engine="vllm" --results_dir="results_debug_completion" --max_tokens 3072
done
done
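# A minimal single-run smoke test is sketched below (hedged: not part of the sweep
# above; it reuses only flags that already appear in this script, and the worker/GPU
# counts are placeholders to adapt to your hardware).
# python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3.1-8B-Instruct" --rm_type="pairwise" --num_workers=1 --num_gpu_per_worker=1 --num_shots=0 --completion False --engine="vllm" --results_dir="results_debug"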
# for rm_type in "pairwise" "individual";
# do
# for num_shots in 1 3 5;
# do
# python test_on_rewardbench.py --model_name="meta-llama/Meta-Llama-3.1-70B" --rm_type="${rm_type}" --num_workers=1 --num_gpu_per_worker=8 --num_shots=${num_shots} --engine="vllm"
# done
# done