You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
2024-08-05 02:25:28,293 vllm.engine.llm_engine 3168 INFO Initializing an LLM engine (v0.5.1) with config: model='/model/qwen2-7b-instruct', speculative_config=None, tokenizer='/model/qwen2-7b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/model/qwen2-7b-instruct, use_v2_block_manager=False, enable_prefix_caching=False)
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file vocab.json
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file merges.txt
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file tokenizer.json
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file added_tokens.json
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file special_tokens_map.json
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file tokenizer_config.json
2024-08-05 02:25:28,502 transformers.tokenization_utils_base 3168 WARNING Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2024-08-05 02:25:28,517 transformers.generation.configuration_utils 3168 INFO loading configuration file /model/qwen2-7b-instruct/generation_config.json
2024-08-05 02:25:28,518 transformers.generation.configuration_utils 3168 INFO Generate config GenerationConfig {
"bos_token_id": 151643,
"do_sample": true,
"eos_token_id": [
151645,
151643
],
"pad_token_id": 151643,
"repetition_penalty": 1.05,
"temperature": 0.7,
"top_k": 20,
"top_p": 0.8
}
2024-08-05 02:25:28,600 xinference.core.worker 140 ERROR Failed to load model qwen2-7b-instruct-128K-1-0
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/xinference/core/worker.py", line 841, in launch_builtin_model
await model_ref.load()
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 230, in send
result = await self._wait(future, actor_ref.address, send_message) # type: ignore
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 115, in _wait
return await future
File "xoscar/serialization/core.pyx", line 933, in xoscar.serialization.core.deserialize
File "xoscar/serialization/core.pyx", line 831, in xoscar.serialization.core._deserial_single
File "xoscar/serialization/core.pyx", line 106, in xoscar.serialization.core.Serializer.deserial
File "/usr/local/lib/python3.10/dist-packages/xoscar/serialization/exception.py", line 45, in deserial
return unpickle_buffers(subs)
File "xoscar/serialization/core.pyx", line 261, in xoscar.serialization.core.unpickle_buffers
TypeError: _extractNVMLErrorsAsClasses..gen_new..new() takes 1 positional argument but 2 were given
2024-08-05 02:25:28,664 xinference.api.restful_api 1 ERROR [address=0.0.0.0:34148, pid=140] _extractNVMLErrorsAsClasses..gen_new..new() takes 1 positional argument but 2 were given
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/xinference/api/restful_api.py", line 835, in launch_model
model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/pool.py", line 656, in send
result = await self._run_coro(message.message_id, coro)
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/pool.py", line 367, in _run_coro
return await coro
File "/usr/local/lib/python3.10/dist-packages/xoscar/api.py", line 384, in on_receive
return await super().on_receive(message) # type: ignore
File "xoscar/core.pyx", line 558, in on_receive
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.on_receive
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.on_receive
result = await result
File "/usr/local/lib/python3.10/dist-packages/xinference/core/supervisor.py", line 988, in launch_builtin_model
await _launch_model()
File "/usr/local/lib/python3.10/dist-packages/xinference/core/supervisor.py", line 952, in _launch_model
await _launch_one_model(rep_model_uid)
File "/usr/local/lib/python3.10/dist-packages/xinference/core/supervisor.py", line 932, in _launch_one_model
await worker_ref.launch_builtin_model(
File "xoscar/core.pyx", line 284, in __pyx_actor_method_wrapper
async with lock:
File "xoscar/core.pyx", line 287, in xoscar.core.__pyx_actor_method_wrapper
result = await result
File "/usr/local/lib/python3.10/dist-packages/xinference/core/utils.py", line 45, in wrapped
ret = await func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/xinference/core/worker.py", line 841, in launch_builtin_model
await model_ref.load()
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 230, in send
result = await self._wait(future, actor_ref.address, send_message) # type: ignore
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 115, in _wait
return await future
File "xoscar/serialization/core.pyx", line 933, in xoscar.serialization.core.deserialize
deserialized = _deserial_single(
File "xoscar/serialization/core.pyx", line 831, in xoscar.serialization.core._deserial_single
res = serializer.deserial(serialized[_COMMON_HEADER_LEN:], context, subs)
File "xoscar/serialization/core.pyx", line 106, in xoscar.serialization.core.Serializer.deserial
cpdef deserial(self, tuple serialized, dict context, list subs):
File "/usr/local/lib/python3.10/dist-packages/xoscar/serialization/exception.py", line 45, in deserial
return unpickle_buffers(subs)
File "xoscar/serialization/core.pyx", line 261, in xoscar.serialization.core.unpickle_buffers
result = cloudpickle.loads(buffers[0], buffers=buffers[1:])
TypeError: [address=0.0.0.0:34148, pid=140] _extractNVMLErrorsAsClasses..gen_new..new() takes 1 positional argument but 2 were given
Expected behavior / 期待表现
正常运行
The text was updated successfully, but these errors were encountered:
System Info / 系統信息
xprobe/xinference:735d5dbbb830
Running Xinference with Docker? / 是否使用 Docker 运行 Xinference?
Version info / 版本信息
0.13.1
The command used to start Xinference / 用以启动 xinference 的命令
docker run -p 9997:9997 --gpus='"device=4,5,6,7"' xprobe/xinference:latest xinference-local -H 0.0.0.0
Reproduction / 复现过程
自定义模型配置:
{
"version": 1,
"context_length": 128000,
"model_name": "qwen2-7b-instruct-128K",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"generate",
"chat"
],
"model_description": "qwen2-7b-instruct-128K",
"model_family": "qwen2-instruct",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"none"
],
"model_id": null,
"model_hub": "huggingface",
"model_uri": "file:///model/qwen2-7b-instruct",
"model_revision": null
}
],
"prompt_style": {
"style_name": "QWEN",
"system_prompt": "You are a helpful assistant.",
"roles": [
"user",
"assistant"
],
"intra_message_sep": "\n",
"inter_message_sep": "",
"stop": [
"<|endoftext|>",
"<|im_start|>",
"<|im_end|>"
],
"stop_token_ids": [
151643,
151644,
151645
]
},
"is_builtin": false
}
启动模型配置:
报错信息:
2024-08-05 02:25:28,266 xinference.model.llm.llm_family 140 INFO Caching from URI: file:///model/qwen2-7b-instruct
2024-08-05 02:25:28,266 xinference.model.llm.llm_family 140 INFO Cache /model/qwen2-7b-instruct exists
2024-08-05 02:25:28,289 xinference.model.llm.vllm.core 3168 INFO Loading qwen2-7b-instruct-128K with following model config: {'tokenizer_mode': 'auto', 'trust_remote_code': True, 'tensor_parallel_size': 1, 'block_size': 16, 'swap_space': 4, 'gpu_memory_utilization': 0.9, 'max_num_seqs': 256, 'quantization': None, 'max_model_len': 4096}Enable lora: False. Lora count: 0.
2024-08-05 02:25:28,291 transformers.configuration_utils 3168 INFO loading configuration file /model/qwen2-7b-instruct/config.json
2024-08-05 02:25:28,292 transformers.configuration_utils 3168 INFO Model config Qwen2Config {
"_name_or_path": "/model/qwen2-7b-instruct",
"architectures": [
"Qwen2ForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"hidden_act": "silu",
"hidden_size": 3584,
"initializer_range": 0.02,
"intermediate_size": 18944,
"max_position_embeddings": 32768,
"max_window_layers": 28,
"model_type": "qwen2",
"num_attention_heads": 28,
"num_hidden_layers": 28,
"num_key_value_heads": 4,
"rms_norm_eps": 1e-06,
"rope_theta": 1000000.0,
"sliding_window": 131072,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.42.3",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 152064
}
2024-08-05 02:25:28,293 vllm.engine.llm_engine 3168 INFO Initializing an LLM engine (v0.5.1) with config: model='/model/qwen2-7b-instruct', speculative_config=None, tokenizer='/model/qwen2-7b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/model/qwen2-7b-instruct, use_v2_block_manager=False, enable_prefix_caching=False)
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file vocab.json
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file merges.txt
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file tokenizer.json
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file added_tokens.json
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file special_tokens_map.json
2024-08-05 02:25:28,294 transformers.tokenization_utils_base 3168 INFO loading file tokenizer_config.json
2024-08-05 02:25:28,502 transformers.tokenization_utils_base 3168 WARNING Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2024-08-05 02:25:28,517 transformers.generation.configuration_utils 3168 INFO loading configuration file /model/qwen2-7b-instruct/generation_config.json
2024-08-05 02:25:28,518 transformers.generation.configuration_utils 3168 INFO Generate config GenerationConfig {
"bos_token_id": 151643,
"do_sample": true,
"eos_token_id": [
151645,
151643
],
"pad_token_id": 151643,
"repetition_penalty": 1.05,
"temperature": 0.7,
"top_k": 20,
"top_p": 0.8
}
2024-08-05 02:25:28,600 xinference.core.worker 140 ERROR Failed to load model qwen2-7b-instruct-128K-1-0
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/xinference/core/worker.py", line 841, in launch_builtin_model
await model_ref.load()
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 230, in send
result = await self._wait(future, actor_ref.address, send_message) # type: ignore
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 115, in _wait
return await future
File "xoscar/serialization/core.pyx", line 933, in xoscar.serialization.core.deserialize
File "xoscar/serialization/core.pyx", line 831, in xoscar.serialization.core._deserial_single
File "xoscar/serialization/core.pyx", line 106, in xoscar.serialization.core.Serializer.deserial
File "/usr/local/lib/python3.10/dist-packages/xoscar/serialization/exception.py", line 45, in deserial
return unpickle_buffers(subs)
File "xoscar/serialization/core.pyx", line 261, in xoscar.serialization.core.unpickle_buffers
TypeError: _extractNVMLErrorsAsClasses..gen_new..new() takes 1 positional argument but 2 were given
2024-08-05 02:25:28,664 xinference.api.restful_api 1 ERROR [address=0.0.0.0:34148, pid=140] _extractNVMLErrorsAsClasses..gen_new..new() takes 1 positional argument but 2 were given
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/xinference/api/restful_api.py", line 835, in launch_model
model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/pool.py", line 656, in send
result = await self._run_coro(message.message_id, coro)
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/pool.py", line 367, in _run_coro
return await coro
File "/usr/local/lib/python3.10/dist-packages/xoscar/api.py", line 384, in on_receive
return await super().on_receive(message) # type: ignore
File "xoscar/core.pyx", line 558, in on_receive
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.on_receive
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.on_receive
result = await result
File "/usr/local/lib/python3.10/dist-packages/xinference/core/supervisor.py", line 988, in launch_builtin_model
await _launch_model()
File "/usr/local/lib/python3.10/dist-packages/xinference/core/supervisor.py", line 952, in _launch_model
await _launch_one_model(rep_model_uid)
File "/usr/local/lib/python3.10/dist-packages/xinference/core/supervisor.py", line 932, in _launch_one_model
await worker_ref.launch_builtin_model(
File "xoscar/core.pyx", line 284, in __pyx_actor_method_wrapper
async with lock:
File "xoscar/core.pyx", line 287, in xoscar.core.__pyx_actor_method_wrapper
result = await result
File "/usr/local/lib/python3.10/dist-packages/xinference/core/utils.py", line 45, in wrapped
ret = await func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/xinference/core/worker.py", line 841, in launch_builtin_model
await model_ref.load()
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 230, in send
result = await self._wait(future, actor_ref.address, send_message) # type: ignore
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 115, in _wait
return await future
File "xoscar/serialization/core.pyx", line 933, in xoscar.serialization.core.deserialize
deserialized = _deserial_single(
File "xoscar/serialization/core.pyx", line 831, in xoscar.serialization.core._deserial_single
res = serializer.deserial(serialized[_COMMON_HEADER_LEN:], context, subs)
File "xoscar/serialization/core.pyx", line 106, in xoscar.serialization.core.Serializer.deserial
cpdef deserial(self, tuple serialized, dict context, list subs):
File "/usr/local/lib/python3.10/dist-packages/xoscar/serialization/exception.py", line 45, in deserial
return unpickle_buffers(subs)
File "xoscar/serialization/core.pyx", line 261, in xoscar.serialization.core.unpickle_buffers
result = cloudpickle.loads(buffers[0], buffers=buffers[1:])
TypeError: [address=0.0.0.0:34148, pid=140] _extractNVMLErrorsAsClasses..gen_new..new() takes 1 positional argument but 2 were given
Expected behavior / 期待表现
正常运行
The text was updated successfully, but these errors were encountered: