Showing 24 changed files with 1,677 additions and 0 deletions.
@@ -0,0 +1,72 @@
save_dir: "./experiment/"

ablation:
  use_ablate: false

# Data Ingestion -------------------
data:
  file_type: "huggingface" # one of 'json', 'csv', 'huggingface'
  path: "yahma/alpaca-cleaned"
  prompt:
    >- # prompt template; column names must appear inside {} brackets and match the columns in your data
      Below is an instruction that describes a task.
      Write a response that appropriately completes the request.
      ### Instruction: {instruction}
      ### Input: {input}
      ### Output:
  prompt_stub:
    >- # stub appended to the prompt during training; omitted for the test set and at inference; must contain exactly one variable
      {output}
  test_size: 0.1 # fraction of the data used for testing; an integer is treated as an absolute number of samples
  train_size: 0.9 # fraction of the data used for training; an integer is treated as an absolute number of samples
  train_test_split_seed: 42

# Model Definition -------------------
model:
  hf_model_ckpt: "NousResearch/Llama-2-7b-hf"
  quantize: true
  bitsandbytes:
    load_in_4bit: true
    bnb_4bit_compute_dtype: "bfloat16"
    bnb_4bit_quant_type: "nf4"

# LoRA Params -------------------
lora:
  task_type: "CAUSAL_LM"
  r: 32
  lora_dropout: 0.1
  target_modules:
    - q_proj
    - v_proj
    - k_proj
    - o_proj
    - up_proj
    - down_proj
    - gate_proj

# Training -------------------
training:
  training_args:
    num_train_epochs: 5
    per_device_train_batch_size: 4
    gradient_accumulation_steps: 4
    gradient_checkpointing: True
    optim: "paged_adamw_32bit"
    logging_steps: 100
    learning_rate: 2.0e-4
    bf16: true # set to true for mixed-precision training on newer GPUs (Ampere or later)
    tf32: true
    # fp16: false # set to true for mixed-precision training on older GPUs
    max_grad_norm: 0.3
    warmup_ratio: 0.03
    lr_scheduler_type: "constant"
  sft_args:
    max_seq_length: 5000
    # neftune_noise_alpha: None

inference:
  max_new_tokens: 1024
  use_cache: True
  do_sample: True
  top_p: 0.9
  temperature: 0.8
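
For orientation, here is a minimal sketch of how a config like the one above could be loaded and validated. It assumes the Config pydantic model imported elsewhere in this commit (src.pydantic_models.config_model.Config) mirrors the sections shown here, and the "config.yml" path is hypothetical; the toolkit's actual loading entry point is not part of this diff.

import yaml  # PyYAML

from src.pydantic_models.config_model import Config  # defined elsewhere in this commit

# Hypothetical path to a YAML file with the contents shown above
with open("config.yml") as f:
    raw_config = yaml.safe_load(f)

# Pydantic validates the nested sections (data, model, lora, training, inference)
config = Config(**raw_config)
print(config.model.hf_model_ckpt, config.training.training_args.learning_rate)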
@@ -0,0 +1,95 @@
import os
from os.path import join, exists
from functools import partial
from typing import Tuple, Union
import pickle

import re
from datasets import Dataset
from rich.console import Console
from rich.layout import Layout
from rich.panel import Panel

from src.data.ingestor import Ingestor, get_ingestor


class DatasetGenerator:
    def __init__(
        self,
        file_type: str,
        path: str,
        prompt: str,
        prompt_stub: str,
        test_size: Union[float, int],
        train_size: Union[float, int],
        train_test_split_seed: int,
    ):
        # Resolve the ingestor class for this file type, then instantiate it with the data path
        self.ingestor: Ingestor = get_ingestor(file_type)(path)

        self.dataset: Dataset = self.ingestor.to_dataset()
        self.prompt: str = prompt
        self.prompt_stub: str = prompt_stub
        self.test_size = test_size
        self.train_size = train_size
        self.train_test_split_seed: int = train_test_split_seed

        self.train_columns: list = self._get_train_columns()
        self.test_column: str = self._get_test_column()

    def _get_train_columns(self):
        # Column names referenced as {column} placeholders in the prompt template
        pattern = r"\{([^}]*)\}"
        return re.findall(pattern, self.prompt)

    def _get_test_column(self):
        # The single {column} placeholder in the prompt stub, i.e. the label column
        pattern = r"\{([^}]*)\}"
        return re.findall(pattern, self.prompt_stub)[0]

    # TODO: stratify_by_column
    def _train_test_split(self):
        self.dataset = self.dataset.train_test_split(
            test_size=self.test_size,
            train_size=self.train_size,
            seed=self.train_test_split_seed,
        )

    def _format_one_prompt(self, example, is_test: bool = False):
        train_mapping = {var_name: example[var_name] for var_name in self.train_columns}
        example["formatted_prompt"] = self.prompt.format(**train_mapping)

        # Only training examples get the stub (the target output) appended
        if not is_test:
            test_mapping = {self.test_column: example[self.test_column]}
            example["formatted_prompt"] += self.prompt_stub.format(**test_mapping)

        return example

    def _format_prompts(self):
        self.dataset["train"] = self.dataset["train"].map(
            partial(self._format_one_prompt, is_test=False)
        )
        self.dataset["test"] = self.dataset["test"].map(
            partial(self._format_one_prompt, is_test=True)
        )

    def get_dataset(self) -> Tuple[Dataset, Dataset]:
        self._train_test_split()
        self._format_prompts()

        return self.dataset["train"], self.dataset["test"]

    def save_dataset(self, save_dir: str):
        os.makedirs(save_dir, exist_ok=True)
        with open(join(save_dir, "dataset.pkl"), "wb") as f:
            pickle.dump(self.dataset, f)

    def load_dataset_from_pickle(self, save_dir: str):
        data_path = join(save_dir, "dataset.pkl")

        if not exists(data_path):
            raise FileNotFoundError(f"Train set pickle not found at {save_dir}")

        with open(data_path, "rb") as f:
            data = pickle.load(f)
            self.dataset = data

        return self.dataset["train"], self.dataset["test"]
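
An illustrative driver for the DatasetGenerator above, wired with values from the sample config. The import path is an assumption (the file name is not visible in this rendering), and in the toolkit these arguments would normally come from the validated config rather than literals.

from src.data.dataset_generator import DatasetGenerator  # assumed module path

generator = DatasetGenerator(
    file_type="huggingface",
    path="yahma/alpaca-cleaned",
    prompt=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request. "
        "### Instruction: {instruction} ### Input: {input} ### Output: "
    ),
    prompt_stub="{output}",
    test_size=0.1,
    train_size=0.9,
    train_test_split_seed=42,
)

# Splits the dataset, adds a "formatted_prompt" column per example, and returns (train, test)
train_ds, test_ds = generator.get_dataset()

# Persist the split and formatted dataset so later runs can reload it from pickle
generator.save_dataset("./experiment/dataset")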
@@ -0,0 +1,61 @@
from abc import ABC, abstractmethod
from functools import partial

import ijson
import csv
from datasets import Dataset, load_dataset, concatenate_datasets


def get_ingestor(data_type: str):
    if data_type == "json":
        return JsonIngestor
    elif data_type == "csv":
        return CsvIngestor
    elif data_type == "huggingface":
        return HuggingfaceIngestor
    else:
        raise ValueError(
            f"'type' must be one of 'json', 'csv', or 'huggingface', you have {data_type}"
        )


class Ingestor(ABC):
    @abstractmethod
    def to_dataset(self) -> Dataset:
        pass


class JsonIngestor(Ingestor):
    def __init__(self, path: str):
        self.path = path

    def _json_generator(self):
        with open(self.path, "rb") as f:
            for item in ijson.items(f, "item"):
                yield item

    def to_dataset(self) -> Dataset:
        return Dataset.from_generator(self._json_generator)


class CsvIngestor(Ingestor):
    def __init__(self, path: str):
        self.path = path

    def _csv_generator(self):
        with open(self.path) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                yield row

    def to_dataset(self) -> Dataset:
        return Dataset.from_generator(self._csv_generator)


class HuggingfaceIngestor(Ingestor):
    def __init__(self, path: str):
        self.path = path

    def to_dataset(self) -> Dataset:
        ds = load_dataset(self.path)
        return concatenate_datasets(ds.values())
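
A quick illustration of the factory above. The local CSV path is hypothetical, and only the "huggingface" branch is exercised by the sample config in this commit.

from src.data.ingestor import get_ingestor

# Local CSV file (hypothetical path); rows are streamed via csv.DictReader
csv_dataset = get_ingestor("csv")("./data/train.csv").to_dataset()

# Hugging Face Hub dataset; every split is concatenated into a single Dataset
hf_dataset = get_ingestor("huggingface")("yahma/alpaca-cleaned").to_dataset()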
@@ -0,0 +1,12 @@
from abc import ABC, abstractmethod
from typing import Union, List, Tuple, Dict


class Finetune(ABC):
    @abstractmethod
    def finetune(self):
        pass

    @abstractmethod
    def save_model(self):
        pass
@@ -0,0 +1,140 @@
from os.path import join, exists
from typing import Tuple

import torch

import bitsandbytes as bnb
from datasets import Dataset
from accelerate import Accelerator
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    ProgressCallback,
)
from peft import (
    prepare_model_for_kbit_training,
    get_peft_model,
    LoraConfig,
)
from trl import SFTTrainer
from rich.console import Console


from src.pydantic_models.config_model import Config
from src.utils.save_utils import DirectoryHelper
from src.finetune.finetune import Finetune
from src.ui.rich_ui import RichUI


class LoRAFinetune(Finetune):
    def __init__(self, config: Config, directory_helper: DirectoryHelper):
        self.config = config

        self._model_config = config.model
        self._training_args = config.training.training_args
        self._sft_args = config.training.sft_args
        self._lora_config = LoraConfig(**config.lora.model_dump())
        self._directory_helper = directory_helper
        self._weights_path = self._directory_helper.save_paths.weights
        self._trainer = None

        self.model = None
        self.tokenizer = None

        """ TODO: Figure out how to handle multi-gpu
        if config.accelerate:
            self.accelerator = Accelerator()
            self.accelerator.state.deepspeed_plugin.deepspeed_config[
                "train_micro_batch_size_per_gpu"
            ] = self.config.training.training_args.per_device_train_batch_size
        if config.accelerate:
            # device_index = Accelerator().process_index
            self.device_map = None  # {"": device_index}
        else:
        """
        self.device_map = self._model_config.device_map

        self._load_model_and_tokenizer()

    def _load_model_and_tokenizer(self):
        ckpt = self._model_config.hf_model_ckpt
        RichUI.on_basemodel_load(ckpt)
        model = self._get_model()
        tokenizer = self._get_tokenizer()
        RichUI.after_basemodel_load(ckpt)

        self.model = model
        self.tokenizer = tokenizer

    def _get_model(self):
        model = AutoModelForCausalLM.from_pretrained(
            self._model_config.hf_model_ckpt,
            quantization_config=(
                BitsAndBytesConfig(**self._model_config.bitsandbytes.model_dump())
                if not self.config.accelerate
                else None
            ),
            use_cache=False,
            device_map=self.device_map,
        )

        model.config.pretraining_tp = 1

        return model

    def _get_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained(self._model_config.hf_model_ckpt)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        return tokenizer

    def _inject_lora(self):
        if not self.config.accelerate:
            self.model.gradient_checkpointing_enable()
            self.model = prepare_model_for_kbit_training(self.model)
        self.model = get_peft_model(self.model, self._lora_config)

        if not self.config.accelerate:
            self.optimizer = bnb.optim.Adam8bit(
                self.model.parameters(), lr=self._training_args.learning_rate
            )
            self.lr_scheduler = torch.optim.lr_scheduler.ConstantLR(self.optimizer)
        if self.config.accelerate:
            self.model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
                self.model, self.optimizer, self.lr_scheduler
            )

    def finetune(self, train_dataset: Dataset):
        logging_dir = join(self._weights_path, "logs")
        training_args = TrainingArguments(
            output_dir=self._weights_path,
            logging_dir=logging_dir,
            report_to="none",
            **self._training_args.model_dump(),
        )

        progress_callback = ProgressCallback()

        self._trainer = SFTTrainer(
            model=self.model,
            train_dataset=train_dataset,
            peft_config=self._lora_config,
            tokenizer=self.tokenizer,
            packing=True,
            args=training_args,
            dataset_text_field="formatted_prompt",  # TODO: maybe move consts to a dedicated folder
            callbacks=[progress_callback],
            # optimizers=[self.optimizer, self.lr_scheduler],
            **self._sft_args.model_dump(),
        )

        trainer_stats = self._trainer.train()

    def save_model(self) -> None:
        self._trainer.model.save_pretrained(self._weights_path)
        self.tokenizer.save_pretrained(self._weights_path)
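
A rough sketch of how LoRAFinetune is meant to be driven end to end. The module path is an assumption, and the Config, DirectoryHelper, and train dataset are taken as given because their construction is handled by other parts of the toolkit; the finetune/save_model flow itself follows the class above.

from datasets import Dataset

from src.finetune.lora import LoRAFinetune  # assumed module path
from src.pydantic_models.config_model import Config
from src.utils.save_utils import DirectoryHelper


def run_lora_finetune(config: Config, dir_helper: DirectoryHelper, train_dataset: Dataset) -> None:
    # Constructing the class loads the (optionally 4-bit quantized) base model and tokenizer
    finetuner = LoRAFinetune(config, dir_helper)

    # Runs SFTTrainer over the "formatted_prompt" column produced by DatasetGenerator
    finetuner.finetune(train_dataset)

    # Writes the LoRA adapter and tokenizer to the configured weights directory
    finetuner.save_model()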
@@ -0,0 +1,12 @@
from abc import ABC, abstractmethod
from typing import Union, List, Tuple, Dict


class Inference(ABC):
    @abstractmethod
    def infer_one(self, prompt: str):
        pass

    @abstractmethod
    def infer_all(self):
        pass
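
To show how these abstract methods and the inference: block of the sample config fit together, here is a hedged sketch of a concrete subclass. The class name and loading details are illustrative assumptions, not the toolkit's actual implementation; it only relies on the Inference ABC defined above.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


class SimpleLoRAInference(Inference):  # hypothetical subclass of the ABC above
    def __init__(self, weights_path: str, prompts: list, max_new_tokens: int = 1024):
        self.tokenizer = AutoTokenizer.from_pretrained(weights_path)
        self.model = AutoModelForCausalLM.from_pretrained(weights_path, device_map="auto")
        self.prompts = prompts
        self.max_new_tokens = max_new_tokens

    def infer_one(self, prompt: str) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                use_cache=True,   # generation settings mirror the `inference:`
                do_sample=True,   # section of the sample config
                top_p=0.9,
                temperature=0.8,
            )
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

    def infer_all(self) -> list:
        # Keeps the ABC's no-argument signature by iterating prompts stored at construction
        return [self.infer_one(p) for p in self.prompts]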