
Commit 30e5dae

Commit message: [refactor] 3.5
1 parent 7e749fc

17 files changed: +428 -913 lines

.gitignore (+2)

@@ -6,6 +6,8 @@ wandb/
 
 # Old
 old/
+temp/
+profiler/
 
 # Logs
 logs/

ochat/config/__init__.py (+43)

@@ -0,0 +1,43 @@
+from functools import partial
+
+import torch
+import transformers
+
+from ochat.config.model_config import ModelConfig
+from ochat.config.conversation_template import Message, Conversation, ConversationTemplate
+import ochat.models
+
+
+_V3_2_PREFIXES = {
+    # ShareGPT & OpenAI mapping
+
+    "human": "User:",
+    "user": "User:",
+    "gpt": "Assistant:",
+    "assistant": "Assistant:"
+}
+
+
+def _v3_2_role_prefix(from_role, condition):
+    return f"{condition} {_V3_2_PREFIXES[from_role]}".strip()
+
+
+MODEL_CONFIG_MAP = {
+    # OpenChat V3.2
+    "openchat_v3.2": ModelConfig(
+        # Model
+        model_max_context=4096,
+        model_tokenizer_create=partial(transformers.AutoTokenizer.from_pretrained,
+                                       use_fast=False,
+                                       legacy=False),
+        model_create_for_training=partial(ochat.models.LlamaForCausalLM.from_pretrained,
+                                          low_cpu_mem_usage=True,
+                                          torch_dtype=torch.bfloat16),
+
+        # Conversation Template
+        conversation_template=partial(ConversationTemplate,
+                                      role_prefix=_v3_2_role_prefix,
+                                      eot="<|end_of_turn|>",
+                                      inference_condition="GPT4")
+    )
+}
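
The refactored config wires everything together with functools.partial, so each MODEL_CONFIG_MAP entry is fully declarative. A minimal consumption sketch (not part of this commit; the checkpoint path is a placeholder):

from ochat.config import MODEL_CONFIG_MAP

config = MODEL_CONFIG_MAP["openchat_v3.2"]

# Instantiate the tokenizer, then bind it into the pre-configured template.
# "path/to/checkpoint" is a placeholder, not a path from this commit.
tokenizer = config.model_tokenizer_create("path/to/checkpoint")
template = config.conversation_template(tokenizer=tokenizer)

# The role prefix prepends the condition string to the base prefix:
# _v3_2_role_prefix("user", "GPT4") == "GPT4 User:"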

ochat/config/conversation_template.py (+116)

@@ -0,0 +1,116 @@
+from typing import Optional, Callable, Iterable, List, Dict
+import re
+
+from pydantic import BaseModel, Field
+
+
+class Message(BaseModel):
+    role: str = Field(..., alias="from")
+    value: str
+
+    weight: Optional[float] = None
+
+
+class Conversation(BaseModel):
+    items: List[Message]
+
+    condition: Optional[str] = None
+    system: str = ""
+
+
+class ConversationTemplate(BaseModel):
+    tokenizer: Callable
+
+    # Prompt
+    role_prefix: Callable
+    eot: str
+
+    inference_condition: Optional[str] = None
+
+    # Private
+    bos_tokens_: List[int]
+    eot_tokens_: List[int]
+
+    def __init__(self, **data):
+        tokenizer = data["tokenizer"]
+        eot = data["eot"]
+        bos_tokens_ = tokenizer("").input_ids
+        eot_tokens_ = tokenizer(eot, add_special_tokens=False).input_ids
+
+        super().__init__(**data, bos_tokens_=bos_tokens_, eot_tokens_=eot_tokens_)
+
+    def safe_tokenize(self, strings: Iterable[str]) -> List[List[int]]:
+        return self.tokenizer(strings, split_special_tokens=True, return_attention_mask=False, add_special_tokens=False).input_ids
+
+    def tokenize_conversations(self, conversations: Iterable[Conversation], inference: bool = False):
+        # Pre-tokenize all conversations
+        default_condition = self.inference_condition if inference else None
+
+        sys_mappings = set()
+        role_mappings = set()
+        all_text = []
+        for conv in conversations:
+            sys_mappings.add(conv.system)
+            for msg in conv.items:
+                role_mappings.add((msg.role, conv.condition or default_condition))
+                all_text.append(msg.value)
+
+        sys_mappings = list(sys_mappings)
+        role_mappings = list(role_mappings)
+
+        # Tokenize
+        sys_mappings = dict(zip(sys_mappings, self.safe_tokenize(sys_mappings)))
+        role_mappings = dict(zip(role_mappings, self.safe_tokenize([self.role_prefix(*args) for args in role_mappings])))
+        all_text = self.safe_tokenize(all_text)
+
+        # Convert
+        result_tokens = []
+        result_weights = []
+        all_text_idx = 0
+        for conv in conversations:
+            tokens = []
+            weights = []
+
+            # bos tokens
+            tokens.extend(self.bos_tokens_)
+            weights.extend([0.] * len(self.bos_tokens_))
+
+            # System
+            if conv.system:
+                system = sys_mappings[conv.system]
+                tokens.extend(system)
+                weights.extend([0.] * len(system))
+
+                tokens.extend(self.eot_tokens_)
+                weights.extend([0.] * len(self.eot_tokens_))
+
+            # Messages
+            last_idx = len(conv.items) - 1
+            for idx, msg in enumerate(conv.items):
+                # Prefix
+                role = role_mappings[(msg.role, conv.condition or default_condition)]
+                tokens.extend(role)
+                weights.extend([0.] * len(role))
+
+                # Message
+                text = all_text[all_text_idx]
+                all_text_idx += 1
+
+                if not inference:
+                    assert msg.weight is not None
+
+                tokens.extend(text)
+                weights.extend([msg.weight] * len(text))
+
+                if not (inference and idx == last_idx):  # Do not add EOT on last turn during inference
+                    tokens.extend(self.eot_tokens_)
+                    weights.extend([msg.weight] * len(self.eot_tokens_))
+
+            # Append result
+            result_tokens.append(tokens)
+            result_weights.append(weights)
+
+        # Sanity check
+        assert all_text_idx == len(all_text)
+
+        return result_tokens, result_weights
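
To make the token/weight bookkeeping concrete, here is a hedged walk-through sketch (not from this commit; assumes a template built as in ochat/config/__init__.py above):

# Sketch only: `template` built as shown in ochat/config/__init__.py.
conv = Conversation(
    condition="GPT4",  # role_prefix needs a condition during training
    items=[
        Message(**{"from": "user",      "value": "Hello",     "weight": 0.0}),
        Message(**{"from": "assistant", "value": "Hi there!", "weight": 1.0}),
    ],
)

tokens, weights = template.tokenize_conversations([conv])

# tokens[0]:  <bos> "GPT4 User:" "Hello" <|end_of_turn|>
#             "GPT4 Assistant:" "Hi there!" <|end_of_turn|>
# weights[0]: 0.0 for bos, role prefixes, and the user turn;
#             1.0 for the assistant text and its trailing EOT token.

Note that tokenize_conversations iterates conversations twice (once to collect strings, once to assemble results), so it expects a list or other re-iterable, not a one-shot generator.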

ochat/config/model_config.py (+8 -214)

@@ -1,219 +1,13 @@
-from typing import Optional, Callable, Union
-from dataclasses import dataclass
-from functools import partial
+from typing import Callable
 
-import torch
-import transformers
-import ochat.models
+from pydantic import BaseModel
 
 
-@dataclass
-class ModelConfig:
-    name: str
-
-    # Prompt
-    role_prefix: Union[dict, Callable]
-    ai_role: str
-    eot_token: str
-    bos_token: Optional[str] = None
-
-    condition_fn: Optional[Callable] = None
-
+class ModelConfig(BaseModel):
     # Model
-    model_max_context: Optional[int] = None
-    model_tokenizer_create: Optional[Callable] = None
-    model_create_for_training: Optional[Callable] = None
-
-    # Get template
-    def generate_conversation_template(self, tokenize_fn, tokenize_special_fn, system_prompt, message_list, message_props=None):
-        tokens = []
-        masks = []
-        weights = []
-
-        # begin of sentence (bos)
-        if self.bos_token:
-            t = tokenize_special_fn(self.bos_token)
-
-            tokens.extend([t])
-            masks.extend([False])
-            weights.extend([0.])
-
-        # Condition
-        if self.condition_fn is not None:
-            t = tokenize_fn(self.condition_fn(message_props)) + [tokenize_special_fn(self.eot_token)]
-
-            tokens.extend(t)
-            masks.extend([False] * len(t))
-            weights.extend([0.] * len(t))
-
-        # System
-        if system_prompt:
-            t = tokenize_fn(system_prompt) + [tokenize_special_fn(self.eot_token)]
-
-            tokens.extend(t)
-            masks.extend([False] * len(t))
-            weights.extend([0.] * len(t))
-
-        # Messages
-        for idx, message in enumerate(message_list):
-            # Prefix
-            if callable(self.role_prefix):
-                role_prefix = self.role_prefix(message["from"], message_props)
-            else:
-                role_prefix = self.role_prefix[message["from"]]
-
-            t = tokenize_fn(role_prefix)
-            tokens.extend(t)
-            masks.extend([False] * len(t))
-            weights.extend([0.] * len(t))
-
-            # Message
-            if "value" in message:
-                t = tokenize_fn(message["value"]) + [tokenize_special_fn(self.eot_token)]
-
-                # determine weights
-                use_loss = (message["from"] == self.ai_role) and bool(message.get("use_loss", True))
-                w = 1.0 if use_loss else 0.0
-
-                if message_props is not None and ("weight" in message_props):
-                    w *= message_props["weight"]
-
-                tokens.extend(t)
-                masks.extend([use_loss] * len(t))
-                weights.extend([w] * len(t))
-            else:
-                assert idx == len(message_list) - 1, "Empty message for completion must be on the last."
-
-        return tokens, masks, weights
-
-
-def _v2_conditional_prefix(from_role, props):
-    human_prefix = "User:"
-    gpt4_prefix = "Assistant GPT4:"
-    other_prefix = "Assistant GPT3:"
-
-    if from_role == "human":
-        return human_prefix
-
-    if from_role == "gpt":
-        if props is None:
-            return gpt4_prefix  # inference using gpt-4 prefix
-
-        return gpt4_prefix if props["is_gpt4"] else other_prefix
-
-    raise NotImplementedError(f"Unknown role {from_role}")
-
-
-def _v3_2_conditional_prefix(from_role, props):
-    gpt3_prefixes = {
-        "human": "GPT3 User:",
-        "gpt": "GPT3 Assistant:"
-    }
-    gpt4_prefixes = {
-        "human": "GPT4 User:",
-        "gpt": "GPT4 Assistant:"
-    }
-    prefixes = gpt4_prefixes if props is None or props["is_gpt4"] else gpt3_prefixes
-
-    return prefixes[from_role]
-
-
-def _v3_condition(props):
-    gpt4_condition = "Assistant is GPT4"
-    gpt3_condition = "Assistant is GPT3"
-
-    if props is None:
-        return gpt4_condition
-
-    return gpt4_condition if props["is_gpt4"] else gpt3_condition
-
-
-MODEL_CONFIG_MAP = {
-    ################# Llama 2 based models
-    # OpenChat V3.2
-    "openchat_v3.2": ModelConfig(
-        name="OpenChat V3.2 Llama 2",
-
-        # Prompt
-        role_prefix=_v3_2_conditional_prefix,
-        ai_role="gpt",
-        eot_token="<|end_of_turn|>",
-        bos_token="<s>",
-
-        # Tokenize
-        model_max_context=4096,
-        model_tokenizer_create=partial(transformers.AutoTokenizer.from_pretrained,
-                                       use_fast=False,
-                                       legacy=True),
-        model_create_for_training=partial(ochat.models.LlamaForCausalLM.from_pretrained,
-                                          low_cpu_mem_usage=True,
-                                          torch_dtype=torch.bfloat16),
-    ),
-
-    "openchat_v3.1_llama2": ModelConfig(
-        name="OpenChat V3.1 Llama 2",
-
-        # Prompt
-        role_prefix={
-            "human": "User:",
-            "gpt": "Assistant:"
-        },
-        ai_role="gpt",
-        eot_token="<|end_of_turn|>",
-        bos_token="<s>",
-
-        condition_fn=_v3_condition,
-
-        # Tokenize
-        model_max_context=4096,
-        model_tokenizer_create=partial(transformers.AutoTokenizer.from_pretrained,
-                                       use_fast=False,
-                                       legacy=True),
-        model_create_for_training=partial(ochat.models.LlamaForCausalLM.from_pretrained,
-                                          low_cpu_mem_usage=True,
-                                          torch_dtype=torch.bfloat16),
-    ),
-
-    # OpenChat V2
-    "openchat_v2_llama2": ModelConfig(
-        name="OpenChat V2 Llama 2",
-
-        # Prompt
-        role_prefix=_v2_conditional_prefix,
-        ai_role="gpt",
-        eot_token="<|end_of_turn|>",
-        bos_token="<s>",
-
-        # Tokenize
-        model_max_context=4096,
-        model_tokenizer_create=partial(transformers.AutoTokenizer.from_pretrained,
-                                       use_fast=False,
-                                       legacy=True),
-        model_create_for_training=partial(ochat.models.LlamaForCausalLM.from_pretrained,
-                                          low_cpu_mem_usage=True,
-                                          torch_dtype=torch.bfloat16),
-    ),
-
-    # OpenChat
-    "openchat_llama2": ModelConfig(
-        name="OpenChat V1 Llama 2",
-
-        # Prompt
-        role_prefix={
-            "human": "User:",
-            "gpt": "Assistant:"
-        },
-        ai_role="gpt",
-        eot_token="<|end_of_turn|>",
-        bos_token="<s>",
+    model_max_context: int
+    model_tokenizer_create: Callable
+    model_create_for_training: Callable
 
-        # Tokenize
-        model_max_context=4096,
-        model_tokenizer_create=partial(transformers.AutoTokenizer.from_pretrained,
-                                       use_fast=False,
-                                       legacy=True),
-        model_create_for_training=partial(ochat.models.LlamaForCausalLM.from_pretrained,
-                                          low_cpu_mem_usage=True,
-                                          torch_dtype=torch.bfloat16),
-    )
-}
+    # conversation template
+    conversation_template: Callable