(janus) C:\Users\ASUS\anaconda3\Janus>python -c "from transformers import AutoModelForCausalLM; print(AutoModelForCausalLM.from_pretrained('deepseek-ai/Janus-Pro-1B'))"
Traceback (most recent call last):
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\transformers\models\auto\configuration_auto.py", line 1071, in from_pretrained
config_class = CONFIG_MAPPING[config_dict["model_type"]]
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\transformers\models\auto\configuration_auto.py", line 773, in getitem
raise KeyError(key)
KeyError: 'multi_modality'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "", line 1, in
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\transformers\models\auto\auto_factory.py", line 526, in from_pretrained
config, kwargs = AutoConfig.from_pretrained(
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\transformers\models\auto\configuration_auto.py", line 1073, in from_pretrained
raise ValueError(
ValueError: The checkpoint you are trying to load has model type multi_modality but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.
You can update Transformers with the command pip install --upgrade transformers. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. In this case, you can get the most up-to-date code by installing Transformers from source with the command pip install git+https://github.com/huggingface/transformers.git
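For context: the multi_modality model type is registered by the janus package itself (the demo script below imports janus.models before loading, and also passes trust_remote_code=True), so a bare AutoModelForCausalLM.from_pretrained(...) in stock Transformers is expected to fail with exactly this KeyError/ValueError, and upgrading Transformers alone is unlikely to change that. A minimal loading sketch, assuming the Janus repo is installed into the environment (e.g. pip install -e .):

from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor  # the demo below imports this too; it registers the custom classes

model_path = "deepseek-ai/Janus-Pro-1B"
vl_gpt = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,  # same flag the demo script passes
)
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
print(type(vl_gpt))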
(janus) C:\Users\ASUS\anaconda3\Janus>python demo/app_januspro.py
Python version is above 3.10, patching the collections module.
C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\transformers\models\auto\image_processing_auto.py:590: FutureWarning: The image_processor_class argument is deprecated and will be removed in v4.42. Please use slow_image_processor_class, or fast_image_processor_class instead
warnings.warn(
Using a slow image processor as use_fast is unset and a slow processor was saved with this model. use_fast=True will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with use_fast=False.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the legacy (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set legacy=False. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in huggingface/transformers#24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
Some kwargs in processor config are unused and will not have any effect: add_special_token, mask_prompt, sft_format, image_tag, ignore_id, num_image_tokens.
Running on local URL: http://127.0.0.1:7860
To create a public link, set share=True in launch().
IMPORTANT: You are using gradio version 3.48.0, however version 4.44.1 is available, please upgrade.
Traceback (most recent call last):
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\gradio\routes.py", line 534, in predict
output = await route_utils.call_process_api(
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\gradio\route_utils.py", line 226, in call_process_api
output = await app.get_blocks().process_api(
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\gradio\blocks.py", line 1550, in process_api
result = await self.call_function(
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\gradio\blocks.py", line 1185, in call_function
prediction = await anyio.to_thread.run_sync(
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\anyio_backends_asyncio.py", line 2461, in run_sync_in_worker_thread
return await future
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\anyio_backends_asyncio.py", line 962, in run
result = context.run(func, *args)
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\gradio\utils.py", line 661, in wrapper
response = f(*args, **kwargs)
File "C:\Users\ASUS\anaconda3\envs\janus\lib\site-packages\torch\utils_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "C:\Users\ASUS\anaconda3\Janus\demo\app_januspro.py", line 65, in multimodal_understanding
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
TypeError: MultiModalityCausalLM.prepare_inputs_embeds() missing 4 required positional arguments: 'input_ids', 'pixel_values', 'images_seq_mask', and 'images_emb_mask'
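For reference, prepare_inputs_embeds() expects the tensors that VLChatProcessor produces (input_ids, pixel_values, images_seq_mask, images_emb_mask), not a raw conversation dict. A minimal sketch of that input pipeline, based on the upstream Janus example as I understand it and reusing the demo's vl_chat_processor / vl_gpt globals (so a sketch, not a verified drop-in fix):

conversation = [
    {
        "role": "<|User|>",
        "content": f"<image_placeholder>\n{question}",
        "images": [image],
    },
    {"role": "<|Assistant|>", "content": ""},
]
pil_images = [Image.fromarray(image)]
# The processor call is what builds input_ids, pixel_values, images_seq_mask and images_emb_mask
prepare_inputs = vl_chat_processor(
    conversations=conversation, images=pil_images, force_batchify=True
).to(vl_gpt.device)
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)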
And this is the code (app_januspro.py):
import gradio as gr
from transformers import AutoConfig, AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
from PIL import Image
import torch
import numpy as np
import os
import time
import spaces # Import spaces for ZeroGPU compatibility
@torch.inference_mode()
def multimodal_understanding(image, question, seed, top_p, temperature):
    # Set default seed if none is provided
    if seed is None:
        seed = 42  # Default value, you can change it
    # Clear CUDA cache before generating
    torch.cuda.empty_cache()
    # set seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    conversation = {
        "role": "<|User|>",
        "content": f"<image_placeholder>\n{question}",
        "images": [image],
    }
    pil_images = [Image.fromarray(image)]
    # Ensure the biases are the same type as the inputs
    prepare_inputs = conversation  # This should be a dictionary, not a list
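    # NOTE: passing the raw conversation dict here is what triggers the TypeError in the
    # traceback above; prepare_inputs_embeds() never receives input_ids, pixel_values,
    # images_seq_mask or images_emb_mask, which VLChatProcessor would normally build
    # (see the sketch further up).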
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
    print(f"Input type: {inputs_embeds.dtype}")  # Check the dtype here after preparing the inputs
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False if temperature == 0 else True,
        use_cache=True,
        temperature=temperature,
        top_p=top_p,
    )
    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    return answer
def generate(input_ids,
             width,
             height,
             temperature: float = 1,
             parallel_size: int = 5,
             cfg_weight: float = 5,
             image_token_num_per_image: int = 576,
             patch_size: int = 16):
    # Clear CUDA cache before generating
    torch.cuda.empty_cache()

@spaces.GPU(duration=120)  # Specify a duration to avoid timeout
def generate_image(prompt,
                   seed=None,
                   guidance=5,
                   t2i_temperature=1.0):
    # Clear CUDA cache and avoid tracking gradients
    torch.cuda.empty_cache()
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # Avoids dynamic memory allocation
    # Set the seed for reproducible results
    if seed is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        np.random.seed(seed)
    width = 384
    height = 384
    parallel_size = 5
    with torch.no_grad():
        messages = [{'role': '<|User|>', 'content': prompt},
                    {'role': '<|Assistant|>', 'content': ''}]
        text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
                                                                           sft_format=vl_chat_processor.sft_format,
                                                                           system_prompt='')
        text = text + vl_chat_processor.image_start_tag
        input_ids = torch.LongTensor(tokenizer.encode(text))
        output, patches = generate(input_ids,
                                   width // 16 * 16,
                                   height // 16 * 16,
                                   cfg_weight=guidance,
                                   parallel_size=parallel_size,
                                   temperature=t2i_temperature)
        images = unpack(patches,
                        width // 16 * 16,
                        height // 16 * 16,
                        parallel_size=parallel_size)
        return [Image.fromarray(images[i]).resize((768, 768), Image.LANCZOS) for i in range(parallel_size)]
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(value="# Multimodal Understanding")
    with gr.Row():
        image_input = gr.Image()
        with gr.Column():
            question_input = gr.Textbox(label="Question")
            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
    understanding_button = gr.Button("Chat")
    understanding_output = gr.Textbox(label="Response")
    examples_inpainting = gr.Examples(
        label="Multimodal Understanding examples",
        examples=[
            [
                "explain this meme",
                "images/doge.png",
            ],
            [
                "Convert the formula into latex code.",
                "images/equation.png",
            ],
        ],
        inputs=[question_input, image_input],
    )
    gr.Markdown(value="# Text-to-Image Generation")
    with gr.Row():
        cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
        t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
    prompt_input = gr.Textbox(label="Prompt. (Prompt in more detail can help produce better images!)")
    seed_input = gr.Number(label="Seed (Optional)", precision=0, value=12345)
    generation_button = gr.Button("Generate Images")
    image_output = gr.Gallery(label="Generated Images", columns=2, rows=2, height=300)
    examples_t2i = gr.Examples(
        label="Text to image generation examples.",
        examples=[
"Master shifu racoon wearing drip attire as a street gangster.",
"The face of a beautiful girl",
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
"A glass of red wine on a reflective surface.",
"A cute and adorable baby fox with big brown eyes, autumn leaves in the background enchanting,immortal,fluffy, shiny mane,Petals,fairyism,unreal engine 5 and Octane Render,highly detailed, photorealistic, cinematic, natural colors.",
"The image features an intricately designed eye set against a circular backdrop adorned with ornate swirl patterns that evoke both realism and surrealism. At the center of attention is a strikingly vivid blue iris surrounded by delicate veins radiating outward from the pupil to create depth and intensity. The eyelashes are long and dark, casting subtle shadows on the skin around them which appears smooth yet slightly textured as if aged or weathered over time.\n\nAbove the eye, there's a stone-like structure resembling part of classical architecture, adding layers of mystery and timeless elegance to the composition. This architectural element contrasts sharply but harmoniously with the organic curves surrounding it. Below the eye lies another decorative motif reminiscent of baroque artistry, further enhancing the overall sense of eternity encapsulated within each meticulously crafted detail. \n\nOverall, the atmosphere exudes a mysterious aura intertwined seamlessly with elements suggesting timelessness, achieved through the juxtaposition of realistic textures and surreal artistic flourishes. Each component\u2014from the intricate designs framing the eye to the ancient-looking stone piece above\u2014contributes uniquely towards creating a visually captivating tableau imbued with enigmatic allure.",
        ],
        inputs=prompt_input,
    )
    understanding_button.click(
        multimodal_understanding,
        inputs=[image_input, question_input, und_seed_input, top_p, temperature],
        outputs=understanding_output
    )
    generation_button.click(
        fn=generate_image,
        inputs=[prompt_input, seed_input, cfg_weight_input, t2i_temperature],
        outputs=image_output
    )
Help please, what am I doing wrong here?
(janus) C:\Users\ASUS\anaconda3\Janus>python --version
Python 3.10.16
(janus) C:\Users\ASUS\anaconda3\Janus>python -c "import torch; print(torch.__version__)"
2.6.0+cu118
(janus) C:\Users\ASUS\anaconda3\Janus>python -c "import transformers; print(transformers.__version__)"
4.48.3
(janus) C:\Users\ASUS\anaconda3\Janus>python -c "import gradio; print(gradio.__version__)"
3.48.0
(janus) C:\Users\ASUS\anaconda3\Janus>nvidia-smi
Sat Feb 15 12:19:04 2025
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.78 Driver Version: 512.78 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... WDDM | 00000000:01:00.0 Off | N/A |
| N/A 38C P8 4W / N/A | 0MiB / 4096MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
(janus) C:\Users\ASUS\anaconda3\Janus>python -c "import torch; print(torch.float32, torch.float16)"
torch.float32 torch.float16
And here is app_januspro.py again, this time also showing the model-loading and launch sections:
import gradio as gr
from transformers import AutoConfig, AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
from PIL import Image
import torch
import numpy as np
import os
import time
import spaces # Import spaces for ZeroGPU compatibility
# Load model and processor
model_path = "deepseek-ai/Janus-Pro-1B"
config = AutoConfig.from_pretrained(model_path)
language_config = config.language_config
language_config._attn_implementation = 'eager'
vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
language_config=language_config,
trust_remote_code=True,
low_cpu_mem_usage=True
)
if torch.cuda.is_available():
vl_gpt = vl_gpt.to(torch.float32) # Utilise le CPU au lieu du GPU
else:
vl_gpt = vl_gpt.to(torch.float16)
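# NOTE: not part of the original file, just an observation: as written the model is never moved
# to the GPU, so on a CUDA machine it stays on the CPU in float32; if I remember the stock demo
# correctly, it casts to bfloat16 and calls .cuda() on this branch instead.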
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer
cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
@spaces.GPU(duration=120)
# Multimodal Understanding function
@torch.inference_mode()
def multimodal_understanding(image, question, seed, top_p, temperature):
    # Set default seed if none is provided
    if seed is None:
        seed = 42  # Default value, you can change it
def generate(input_ids,
             width,
             height,
             temperature: float = 1,
             parallel_size: int = 5,
             cfg_weight: float = 5,
             image_token_num_per_image: int = 576,
             patch_size: int = 16):
    # Clear CUDA cache before generating
    torch.cuda.empty_cache()
def unpack(dec, width, height, parallel_size=5):
    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
    dec = np.clip((dec + 1) / 2 * 255, 0, 255)
@torch.inference_mode()
@spaces.GPU(duration=120)  # Specify a duration to avoid timeout
def generate_image(prompt,
                   seed=None,
                   guidance=5,
                   t2i_temperature=1.0):
    # Clear CUDA cache and avoid tracking gradients
    torch.cuda.empty_cache()
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # Avoids dynamic memory allocation
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(value="# Multimodal Understanding")
    with gr.Row():
        image_input = gr.Image()
        with gr.Column():
            question_input = gr.Textbox(label="Question")
            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
demo.launch()
demo.queue(concurrency_count=1, max_size=10).launch(server_name="0.0.0.0", server_port=37906, root_path="/path")
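# NOTE: launch() is called twice above; the log shows the app coming up on the default port 7860,
# so the plain demo.launch() is what actually served the UI, and the queue()/launch() line with
# server_port=37906 presumably never takes effect.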