diff --git a/.gitignore b/.gitignore
index 594e5e6..37e8560 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+# Mac finder directory settings
+.DS_Store
\ No newline at end of file
diff --git a/gradio_app.py b/gradio_app.py
index d0eecd8..61e26cd 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -1,4 +1,7 @@
 import os
+import platform
+is_mac = platform.system() == 'Darwin'
+from huggingface_hub import snapshot_download
 
 os.environ['HF_HOME'] = os.path.join(os.path.dirname(__file__), 'hf_download')
 HF_TOKEN = None
@@ -11,11 +14,12 @@
 import gradio as gr
 import tempfile
 
+from openai import OpenAI
+import subprocess
+
 gradio_temp_dir = os.path.join(tempfile.gettempdir(), 'gradio')
 os.makedirs(gradio_temp_dir, exist_ok=True)
 
-from threading import Thread
-
 # Phi3 Hijack
 
 from transformers.models.phi3.modeling_phi3 import Phi3PreTrainedModel
@@ -32,6 +36,9 @@
 
 import lib_omost.canvas as omost_canvas
 
+# https://medium.com/@natsunoyuki/using-civitai-models-with-diffusers-package-45e0c475a67e
+# https://huggingface.co/docs/diffusers/en/api/loaders/single_file
+# https://github.com/huggingface/diffusers/blob/v0.28.0/scripts/convert_original_stable_diffusion_to_diffusers.py
 
 # SDXL
 
@@ -66,25 +73,49 @@
 
 memory_management.unload_all_models([text_encoder, text_encoder_2, vae, unet])
 
+openai_api_base = "http://127.0.0.1:8080/v1"
+client = OpenAI(api_key="EMPTY", base_url=openai_api_base)
+
 # LLM
+# llm_name = "mlx-community/Phi-3-mini-128k-instruct-8bit"
+llm_name = "mlx-community/Meta-Llama-3-8B-4bit"
+# llm_name = "mlx-community/dolphin-2.9.1-llama-3-8b-4bit"
 
-# llm_name = 'lllyasviel/omost-phi-3-mini-128k-8bits'
-llm_name = 'lllyasviel/omost-llama-3-8b-4bits'
-# llm_name = 'lllyasviel/omost-dolphin-2.9-llama3-8b-4bits'
+def load_model(model_name):
+    global process
 
-llm_model = AutoModelForCausalLM.from_pretrained(
-    llm_name,
-    torch_dtype=torch.bfloat16,  # This is computation type, not load/memory type. The loading quant type is baked in config.
-    token=HF_TOKEN,
-    device_map="auto"  # This will load model to gpu with an offload system
-)
+    local_model_dir = os.path.join(
+        os.environ['HF_HOME'], llm_name.split("/")[1]
+    )
 
-llm_tokenizer = AutoTokenizer.from_pretrained(
-    llm_name,
-    token=HF_TOKEN
-)
+    if not os.path.exists(local_model_dir):
+        snapshot_download(repo_id=llm_name, local_dir=local_model_dir)
+
+    command = ["python3", "-m", "mlx_lm.server", "--model", local_model_dir]
+
+    try:
+        process = subprocess.Popen(
+            command, stdin=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+        )
+        process.stdin.write("y\n")
+        process.stdin.flush()
+        print("Model Loaded")
+        return True  #{model_status: "Model Loaded"}
+    except Exception as e:
+        print(f"Exception occurred: {str(e)}")
+        return False  #{model_status: f"Exception occurred: {str(e)}"}
+
+load_model(llm_name)
+
+def kill_process():
+    global process
+    process.terminate()
+    time.sleep(2)
+    if process.poll() is None:  # Check if the process has indeed terminated
+        process.kill()  # Force kill if still running
 
-memory_management.unload_all_models(llm_model)
+    print("Model Killed")
+    return {model_status: "Model Unloaded"}
 
 
 @torch.inference_mode()
@@ -110,7 +141,6 @@ def resize_without_crop(image, target_width, target_height):
     resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
     return np.array(resized_image)
 
-
 @torch.inference_mode()
 def chat_fn(message: str, history: list, seed:int, temperature: float, top_p: float, max_new_tokens: int) -> str:
     np.random.seed(int(seed))
@@ -125,49 +155,26 @@ def chat_fn(message: str, history: list, seed:int, temperature: float, top_p: fl
 
     conversation.append({"role": "user", "content": message})
 
-    memory_management.load_models_to_gpu(llm_model)
-
-    input_ids = llm_tokenizer.apply_chat_template(
-        conversation, return_tensors="pt", add_generation_prompt=True).to(llm_model.device)
-
-    streamer = TextIteratorStreamer(llm_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-
-    def interactive_stopping_criteria(*args, **kwargs) -> bool:
-        if getattr(streamer, 'user_interrupted', False):
-            print('User stopped generation')
-            return True
-        else:
-            return False
-
-    stopping_criteria = StoppingCriteriaList([interactive_stopping_criteria])
-
-    def interrupter():
-        streamer.user_interrupted = True
-        return
-
-    generate_kwargs = dict(
-        input_ids=input_ids,
-        streamer=streamer,
-        stopping_criteria=stopping_criteria,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
+    response = client.chat.completions.create(
+        model="gpt",
+        messages=conversation,
         temperature=temperature,
         top_p=top_p,
+        # frequency_penalty=freq_penalty,
+        max_tokens=max_new_tokens,
+        stream=True,
     )
-
-    if temperature == 0:
-        generate_kwargs['do_sample'] = False
-
-    Thread(target=llm_model.generate, kwargs=generate_kwargs).start()
-
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        # print(outputs)
-        yield "".join(outputs), interrupter
-
-    return
-
+    stop = ["<|im_end|>", "<|endoftext|>"]
+    partial_message = ""
+    for chunk in response:
+        if len(chunk.choices) != 0:
+            if chunk.choices[0].delta.content not in stop:
+                partial_message = partial_message + chunk.choices[0].delta.content
+            else:
+                partial_message = partial_message + ""
+            yield partial_message
+
+    return partial_message
 
 @torch.inference_mode()
 def post_chat(history):
diff --git a/lib_omost/memory_management.py b/lib_omost/memory_management.py
index 05ed6d0..6d4c1ae 100644
--- a/lib_omost/memory_management.py
+++ b/lib_omost/memory_management.py
@@ -1,13 +1,19 @@
 import torch
 from contextlib import contextmanager
+import platform
+is_mac = platform.system() == 'Darwin'
 
 
 high_vram = False
-gpu = torch.device('cuda')
+if is_mac:
+    gpu = torch.device('mps')
+else:
+    gpu = torch.device('cuda')
 cpu = torch.device('cpu')
 
 torch.zeros((1, 1)).to(gpu, torch.float32)
-torch.cuda.empty_cache()
+
+torch.cuda.empty_cache() if not is_mac else torch.mps.empty_cache()
 
 
 models_in_gpu = []
@@ -27,6 +33,8 @@ def movable_bnb_model(m):
 
 
 def load_models_to_gpu(models):
+    if is_mac: return
+
     global models_in_gpu
 
     if not isinstance(models, (tuple, list)):
@@ -49,11 +57,13 @@ def load_models_to_gpu(models):
         print('Load to GPU:', m.__class__.__name__)
 
     models_in_gpu = list(set(models_in_gpu + models))
-    torch.cuda.empty_cache()
+    torch.cuda.empty_cache() if not is_mac else torch.mps.empty_cache()
     return
 
 
 def unload_all_models(extra_models=None):
+    if is_mac: return
+
     global models_in_gpu
 
     if extra_models is None:
diff --git a/mlx_lm_wrapper.py b/mlx_lm_wrapper.py
new file mode 100644
index 0000000..b7fa86b
--- /dev/null
+++ b/mlx_lm_wrapper.py
@@ -0,0 +1,89 @@
+from typing import Any, Callable, Dict, Generator, Optional, Tuple, Union
+# from mlx_lm import load, PreTrainedTokenizer, TokenizerWrapper
+import mlx
+import mlx_lm
+import transformers as tf
+# from transformers import AutoTokenizer, TextIteratorStreamer
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.generation.utils import GenerateOutput
+import numpy as np
+import torch
+
+def load_mlx_lm(llm_name: str) -> Tuple[mlx.nn.Module, tf.PreTrainedTokenizer]:
+    llm_model, llm_tokenizer = mlx_lm.load(llm_name)
+    return MLX_LLM_TransformersWrapper(llm_model, llm_tokenizer), llm_tokenizer
+
+class MLX_LLM_TransformersWrapper(mlx.nn.Module):
+    def __init__(self, model: mlx.nn.Module, tokenizer: tf.PreTrainedTokenizer):
+        self.model = model
+        self.tokenizer = tokenizer
+
+    def generate(self,
+                 input_ids: np.ndarray,
+                 streamer: tf.TextIteratorStreamer,  # Optional["BaseStreamer"] = None,
+                 # inputs: Optional[torch.Tensor] = None,
+                 stopping_criteria: Optional[StoppingCriteriaList] = None,
+                 max_new_tokens: int = 100,
+                 do_sample: bool = True,
+                 temperature: float = 1.0,
+                 top_p: float = 1.0,
+                 **kwargs
+                 ) -> Union[GenerateOutput, torch.LongTensor]:
+
+        if streamer is not None:
+            streamer.put(input_ids.cpu())
+
+        # has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
+        # return self.__stream_generate(self.model, self.tokenizer, input_ids, max_new_tokens, **kwargs)
+
+
+    def __stream_generate(self,
+                          model: torch.nn.Module,
+                          tokenizer: tf.PreTrainedTokenizer,
+                          prompt: Union[str, np.ndarray],
+                          max_tokens: int = 100,
+                          **kwargs,
+                          ) -> Union[str, Generator[str, None, None]]:
+        """
+        A generator producing text based on the given prompt from the model.
+
+        Args:
+            prompt (mx.array): The input prompt.
+            model (nn.Module): The model to use for generation.
+            max_tokens (int): The ma
+            kwargs: The remaining options get passed to :func:`generate_step`.
+              See :func:`generate_step` for more details.
+
+        Yields:
+            Generator[Tuple[mx.array, mx.array]]: A generator producing text.
+ """ + # if not isinstance(tokenizer, TokenizerWrapper): + # tokenizer = TokenizerWrapper(tokenizer) + + if isinstance(prompt, str): + prompt_tokens = mx.array(tokenizer.encode(prompt)) + else: + prompt_tokens = mx.array(prompt) + + detokenizer = tokenizer.detokenizer + detokenizer.reset() + print("generating...") + for (token, prob), n in zip( + generate_step( + prompt=prompt_tokens, + model=model, + temp=kwargs.pop("temperature", 1.0), + **kwargs), + range(max_tokens), + ): + print(f"n: {n}") + if token == tokenizer.eos_token_id: + print("EOS") + break + detokenizer.add_token(token) + print(f"Token: {token}") + # Yield the last segment if streaming + yield detokenizer.last_segment + + detokenizer.finalize() + yield detokenizer.last_segment diff --git a/requirements.txt b/requirements.txt index 96681b5..88b566b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ diffusers==0.28.0 transformers==4.41.1 gradio==4.31.5 -bitsandbytes==0.43.1 +mlx-lm==0.14.3; sys_platform == 'darwin' +bitsandbytes==0.43.1; sys_platform != 'darwin' accelerate==0.30.1 protobuf==3.20 opencv-python @@ -11,3 +12,4 @@ pillow einops torch peft +openai \ No newline at end of file