
Commit

toolkit scripts added
RohitSaha committed Mar 27, 2024
1 parent 8cd045d commit 50e8708
Showing 24 changed files with 1,677 additions and 0 deletions.
72 changes: 72 additions & 0 deletions config.yml
@@ -0,0 +1,72 @@
save_dir: "./experiment/"

ablation:
  use_ablate: false

# Data Ingestion -------------------
data:
  file_type: "huggingface" # one of 'json', 'csv', 'huggingface'
  path: "yahma/alpaca-cleaned"
  prompt:
    >- # prompt, make sure column inputs are enclosed in {} brackets and that they match your data
      Below is an instruction that describes a task.
      Write a response that appropriately completes the request.
      ### Instruction: {instruction}
      ### Input: {input}
      ### Output:
  prompt_stub:
    >- # Stub to add for training at the end of prompt, for test set or inference, this is omitted; make sure only one variable is present
      {output}
  test_size: 0.1 # Proportion of test as % of total; if integer then # of samples
  train_size: 0.9 # Proportion of train as % of total; if integer then # of samples
  train_test_split_seed: 42

# Model Definition -------------------
model:
  hf_model_ckpt: "NousResearch/Llama-2-7b-hf"
  quantize: true
  bitsandbytes:
    load_in_4bit: true
    bnb_4bit_compute_dtype: "bfloat16"
    bnb_4bit_quant_type: "nf4"

# LoRA Params -------------------
lora:
  task_type: "CAUSAL_LM"
  r: 32
  lora_dropout: 0.1
  target_modules:
    - q_proj
    - v_proj
    - k_proj
    - o_proj
    - up_proj
    - down_proj
    - gate_proj

# Training -------------------
training:
  training_args:
    num_train_epochs: 5
    per_device_train_batch_size: 4
    gradient_accumulation_steps: 4
    gradient_checkpointing: True
    optim: "paged_adamw_32bit"
    logging_steps: 100
    learning_rate: 2.0e-4
    bf16: true # Set to true for mixed precision training on newer GPUs
    tf32: true
    # fp16: false # Set to true for mixed precision training on older GPUs
    max_grad_norm: 0.3
    warmup_ratio: 0.03
    lr_scheduler_type: "constant"
  sft_args:
    max_seq_length: 5000
    # neftune_noise_alpha: None

inference:
  max_new_tokens: 1024
  use_cache: True
  do_sample: True
  top_p: 0.9
  temperature: 0.8
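
For orientation, a minimal sketch of how a config like this could be consumed: read the YAML with PyYAML and render one training prompt. The loading code is an illustration, not part of this commit; the column names come from the yahma/alpaca-cleaned dataset referenced above.

# Hypothetical usage sketch: load config.yml and render one training prompt.
import yaml

with open("config.yml") as f:
    cfg = yaml.safe_load(f)

template = cfg["data"]["prompt"] + cfg["data"]["prompt_stub"]
example = {"instruction": "Add two numbers.", "input": "2 and 3", "output": "5"}
print(template.format(**example))  # fully formatted training prompt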
Empty file added src/__init__.py
Empty file.
Empty file added src/data/__init__.py
Empty file.
95 changes: 95 additions & 0 deletions src/data/dataset_generator.py
@@ -0,0 +1,95 @@
import os
from os.path import join, exists
from functools import partial
from typing import Tuple, Union
import pickle

import re
from datasets import Dataset
from rich.console import Console
from rich.layout import Layout
from rich.panel import Panel

from src.data.ingestor import Ingestor, get_ingestor


class DatasetGenerator:
    def __init__(
        self,
        file_type: str,
        path: str,
        prompt: str,
        prompt_stub: str,
        test_size: Union[float, int],
        train_size: Union[float, int],
        train_test_split_seed: int,
    ):
        self.ingestor: Ingestor = get_ingestor(file_type)
        self.ingestor: Ingestor = self.ingestor(path)

        self.dataset: Dataset = self.ingestor.to_dataset()
        self.prompt: str = prompt
        self.prompt_stub: str = prompt_stub
        self.test_size = test_size
        self.train_size = train_size
        self.train_test_split_seed: int = train_test_split_seed

        self.train_columns: list = self._get_train_columns()
        self.test_column: str = self._get_test_column()

    def _get_train_columns(self):
        pattern = r"\{([^}]*)\}"
        return re.findall(pattern, self.prompt)

    def _get_test_column(self):
        pattern = r"\{([^}]*)\}"
        return re.findall(pattern, self.prompt_stub)[0]

    # TODO: stratify_by_column
    def _train_test_split(self):
        self.dataset = self.dataset.train_test_split(
            test_size=self.test_size,
            train_size=self.train_size,
            seed=self.train_test_split_seed,
        )

    def _format_one_prompt(self, example, is_test: bool = False):
        train_mapping = {var_name: example[var_name] for var_name in self.train_columns}
        example["formatted_prompt"] = self.prompt.format(**train_mapping)

        if not is_test:
            test_mapping = {self.test_column: example[self.test_column]}
            example["formatted_prompt"] += self.prompt_stub.format(**test_mapping)

        return example

    def _format_prompts(self):
        self.dataset["train"] = self.dataset["train"].map(
            partial(self._format_one_prompt, is_test=False)
        )
        self.dataset["test"] = self.dataset["test"].map(
            partial(self._format_one_prompt, is_test=True)
        )

    def get_dataset(self) -> Tuple[Dataset, Dataset]:
        self._train_test_split()
        self._format_prompts()

        return self.dataset["train"], self.dataset["test"]

    def save_dataset(self, save_dir: str):
        os.makedirs(save_dir, exist_ok=True)
        with open(join(save_dir, "dataset.pkl"), "wb") as f:
            pickle.dump(self.dataset, f)

    def load_dataset_from_pickle(self, save_dir: str):
        data_path = join(save_dir, "dataset.pkl")

        if not exists(data_path):
            raise FileNotFoundError(f"Train set pickle not found at {save_dir}")

        with open(data_path, "rb") as f:
            data = pickle.load(f)
            self.dataset = data

        return self.dataset["train"], self.dataset["test"]
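
A hedged usage sketch for the generator above, with values mirroring config.yml; the entry point that actually wires this up is not part of this commit.

# Hypothetical usage sketch for DatasetGenerator.
from src.data.dataset_generator import DatasetGenerator

generator = DatasetGenerator(
    file_type="huggingface",
    path="yahma/alpaca-cleaned",
    prompt="### Instruction: {instruction}\n### Input: {input}\n### Output:",
    prompt_stub="{output}",
    test_size=0.1,
    train_size=0.9,
    train_test_split_seed=42,
)
train_ds, test_ds = generator.get_dataset()       # train keeps the stub, test omits it
generator.save_dataset("./experiment/dataset")    # writes dataset.pkl for later reuse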
61 changes: 61 additions & 0 deletions src/data/ingestor.py
@@ -0,0 +1,61 @@
from abc import ABC, abstractmethod
from functools import partial

import ijson
import csv
from datasets import Dataset, load_dataset, concatenate_datasets


def get_ingestor(data_type: str):
    if data_type == "json":
        return JsonIngestor
    elif data_type == "csv":
        return CsvIngestor
    elif data_type == "huggingface":
        return HuggingfaceIngestor
    else:
        raise ValueError(
            f"'type' must be one of 'json', 'csv', or 'huggingface', you have {data_type}"
        )


class Ingestor(ABC):
    @abstractmethod
    def to_dataset(self) -> Dataset:
        pass


class JsonIngestor(Ingestor):
    def __init__(self, path: str):
        self.path = path

    def _json_generator(self):
        with open(self.path, "rb") as f:
            for item in ijson.items(f, "item"):
                yield item

    def to_dataset(self) -> Dataset:
        return Dataset.from_generator(self._json_generator)


class CsvIngestor(Ingestor):
    def __init__(self, path: str):
        self.path = path

    def _csv_generator(self):
        with open(self.path) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                yield row

    def to_dataset(self) -> Dataset:
        return Dataset.from_generator(self._csv_generator)


class HuggingfaceIngestor(Ingestor):
    def __init__(self, path: str):
        self.path = path

    def to_dataset(self) -> Dataset:
        ds = load_dataset(self.path)
        return concatenate_datasets(ds.values())
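
A short, illustrative sketch of how the factory is intended to be used (the CSV path is a placeholder):

# Hypothetical usage sketch for get_ingestor.
from src.data.ingestor import get_ingestor

ingestor_cls = get_ingestor("csv")          # returns the class, not an instance
ingestor = ingestor_cls("data/train.csv")   # illustrative path
dataset = ingestor.to_dataset()             # datasets.Dataset built row by row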
Empty file added src/finetune/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions src/finetune/finetune.py
@@ -0,0 +1,12 @@
from abc import ABC, abstractmethod
from typing import Union, List, Tuple, Dict


class Finetune(ABC):
    @abstractmethod
    def finetune(self):
        pass

    @abstractmethod
    def save_model(self):
        pass
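
Concrete finetuners are expected to implement both hooks; a purely illustrative skeleton (the class name is hypothetical, and the real implementation in this commit is LoRAFinetune below):

# Hypothetical Finetune subclass skeleton.
class FullFinetune(Finetune):
    def finetune(self, train_dataset):
        ...  # run a full-parameter training loop over train_dataset

    def save_model(self):
        ...  # persist model weights and tokenizer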
140 changes: 140 additions & 0 deletions src/finetune/lora.py
@@ -0,0 +1,140 @@
from os.path import join, exists
from typing import Tuple

import torch

import bitsandbytes as bnb
from datasets import Dataset
from accelerate import Accelerator
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    ProgressCallback,
)
from peft import (
    prepare_model_for_kbit_training,
    get_peft_model,
    LoraConfig,
)
from trl import SFTTrainer
from rich.console import Console


from src.pydantic_models.config_model import Config
from src.utils.save_utils import DirectoryHelper
from src.finetune.finetune import Finetune
from src.ui.rich_ui import RichUI


class LoRAFinetune(Finetune):
    def __init__(self, config: Config, directory_helper: DirectoryHelper):
        self.config = config

        self._model_config = config.model
        self._training_args = config.training.training_args
        self._sft_args = config.training.sft_args
        self._lora_config = LoraConfig(**config.lora.model_dump())
        self._directory_helper = directory_helper
        self._weights_path = self._directory_helper.save_paths.weights
        self._trainer = None

        self.model = None
        self.tokenizer = None

        """ TODO: Figure out how to handle multi-gpu
        if config.accelerate:
            self.accelerator = Accelerator()
            self.accelerator.state.deepspeed_plugin.deepspeed_config[
                "train_micro_batch_size_per_gpu"
            ] = self.config.training.training_args.per_device_train_batch_size
        if config.accelerate:
            # device_index = Accelerator().process_index
            self.device_map = None  # {"": device_index}
        else:
        """
        self.device_map = self._model_config.device_map

        self._load_model_and_tokenizer()

    def _load_model_and_tokenizer(self):
        ckpt = self._model_config.hf_model_ckpt
        RichUI.on_basemodel_load(ckpt)
        model = self._get_model()
        tokenizer = self._get_tokenizer()
        RichUI.after_basemodel_load(ckpt)

        self.model = model
        self.tokenizer = tokenizer

    def _get_model(self):
        model = AutoModelForCausalLM.from_pretrained(
            self._model_config.hf_model_ckpt,
            quantization_config=(
                BitsAndBytesConfig(**self._model_config.bitsandbytes.model_dump())
                if not self.config.accelerate
                else None
            ),
            use_cache=False,
            device_map=self.device_map,
        )

        model.config.pretraining_tp = 1

        return model

    def _get_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained(self._model_config.hf_model_ckpt)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        return tokenizer

    def _inject_lora(self):
        if not self.config.accelerate:
            self.model.gradient_checkpointing_enable()
            self.model = prepare_model_for_kbit_training(self.model)
        self.model = get_peft_model(self.model, self._lora_config)

        if not self.config.accelerate:
            self.optimizer = bnb.optim.Adam8bit(
                self.model.parameters(), lr=self._training_args.learning_rate
            )
            self.lr_scheduler = torch.optim.lr_scheduler.ConstantLR(self.optimizer)
        if self.config.accelerate:
            self.model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
                self.model, self.optimizer, self.lr_scheduler
            )

    def finetune(self, train_dataset: Dataset):
        logging_dir = join(self._weights_path, "logs")
        training_args = TrainingArguments(
            output_dir=self._weights_path,
            logging_dir=logging_dir,
            report_to="none",
            **self._training_args.model_dump(),
        )

        progress_callback = ProgressCallback()

        self._trainer = SFTTrainer(
            model=self.model,
            train_dataset=train_dataset,
            peft_config=self._lora_config,
            tokenizer=self.tokenizer,
            packing=True,
            args=training_args,
            dataset_text_field="formatted_prompt",  # TODO: maybe move consts to a dedicated folder
            callbacks=[progress_callback],
            # optimizers=[self.optimizer, self.lr_scheduler],
            **self._sft_args.model_dump(),
        )

        trainer_stats = self._trainer.train()

    def save_model(self) -> None:
        self._trainer.model.save_pretrained(self._weights_path)
        self.tokenizer.save_pretrained(self._weights_path)
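
A hedged end-to-end driver sketch for the class above; the Config construction and the DirectoryHelper signature are assumptions inferred from the imports, not part of this commit.

# Hypothetical driver; the toolkit's real CLI/entry point may differ.
import yaml

from src.data.dataset_generator import DatasetGenerator
from src.finetune.lora import LoRAFinetune
from src.pydantic_models.config_model import Config
from src.utils.save_utils import DirectoryHelper  # constructor below is assumed

with open("config.yml") as f:
    config = Config(**yaml.safe_load(f))

dir_helper = DirectoryHelper("config.yml", config)  # assumed signature
train_ds, _ = DatasetGenerator(**config.data.model_dump()).get_dataset()

finetuner = LoRAFinetune(config, dir_helper)
finetuner.finetune(train_ds)   # trains with the LoRA/SFT settings from config.yml
finetuner.save_model()         # writes the adapter and tokenizer to the weights dir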
Empty file added src/inference/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions src/inference/inference.py
@@ -0,0 +1,12 @@
from abc import ABC, abstractmethod
from typing import Union, List, Tuple, Dict


class Inference(ABC):
    @abstractmethod
    def infer_one(self, prompt: str):
        pass

    @abstractmethod
    def infer_all(self):
        pass
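
The inference block in config.yml maps onto an implementation of this interface; a hedged sketch (the class name and constructor are assumptions):

# Hypothetical Inference subclass using the generation settings from config.yml.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

class CausalLMInference(Inference):
    def __init__(self, weights_path: str, max_new_tokens: int = 1024):
        self.tokenizer = AutoTokenizer.from_pretrained(weights_path)
        self.model = AutoModelForCausalLM.from_pretrained(weights_path)
        self.max_new_tokens = max_new_tokens

    def infer_one(self, prompt: str) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            out = self.model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                do_sample=True,
                top_p=0.9,
                temperature=0.8,
            )
        return self.tokenizer.decode(out[0], skip_special_tokens=True)

    def infer_all(self):
        raise NotImplementedError  # batch path left to the real implementation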
