Commit

first commit on my remote
jjia1 committed Jun 28, 2024
1 parent 62d67b5 commit 99d1ba2
Showing 17 changed files with 1,525 additions and 265 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.env
prompts/
mpi_run_wrapper.sh
5 changes: 5 additions & 0 deletions check_pkgs.py
@@ -0,0 +1,5 @@
import pkg_resources

print("Installed packages:")
for pkg in pkg_resources.working_set:
    print(f"{pkg.project_name} - {pkg.version}")
154 changes: 154 additions & 0 deletions eval_model.py
@@ -0,0 +1,154 @@
# Load imports
import torch
import random
import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from sentence_transformers import SentenceTransformer
import json
import os

pd.set_option('display.max_colwidth', None)

# Implement T5 comparison
model = SentenceTransformer('sentence-t5-base')

def sharpened_cosine_similarity(vec1, vec2, exponent=3):
    cosine_similarity = torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
    return cosine_similarity ** exponent
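
# Worked example (assumed values, for illustration only): raising the cosine
# similarity to an odd exponent keeps its sign but compresses mid-range scores,
# so only strong matches stay high:
#   0.90 ** 3 = 0.729
#   0.50 ** 3 = 0.125
#   0.10 ** 3 = 0.001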

# Compare the similarity of the phrases
def compare_phrases(test_phrase, predicted_phrase):
    print(f"actual instruction: {test_phrase}")
    print(f"predicted instruction: {predicted_phrase}")

    test_embedding = model.encode(test_phrase, convert_to_tensor=True, show_progress_bar=False)
    compare_embedding = model.encode(predicted_phrase, convert_to_tensor=True, show_progress_bar=False)
    score = sharpened_cosine_similarity(test_embedding, compare_embedding).item()

    print(f"Similarity score: {score}\n")

    return test_phrase, predicted_phrase, score


def generate_prompt(input_text, output_text):
    return f"""
Given the original text and a transformed version of it, deduce the instructions that might have guided the transformation.
Original Text: "{input_text}"
Transformed Text: "{output_text}"
What instruction could have led to this transformation?
"""



def get_completion_merged(input_text: str, output_text: str, model, tokenizer) -> str:
    # Build the prompt from the refined template
    prompt = generate_prompt(input_text, output_text)
    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    if 'input_ids' in encodeds:
        model_inputs = encodeds['input_ids'].to(device)
        prompt_length = model_inputs.shape[1]
    else:
        raise ValueError("Tokenized inputs do not contain 'input_ids'.")

    # Generate text from the model
    generated_ids = model.generate(
        inputs=model_inputs,
        max_new_tokens=100,               # cap on newly generated tokens
        do_sample=True,                   # enable sampling for diverse output
        top_k=80,                         # top-k sampling
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode generated tokens to text and return the first (and only) sequence
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

    # Alternative post-processing (unused): truncate each decoded text at the end
    # of its first sentence before returning it.
    # first_sentences = []
    # for decoded_text in decoded:
    #     first_sentence_end = next((i for i, char in enumerate(decoded_text) if char in ".!?"), len(decoded_text))
    #     first_sentences.append(decoded_text[:first_sentence_end + 1])
    # return first_sentences[0] if first_sentences else ""


# Model setup
model_id = os.path.expanduser("~/merged_llama2_70b_prompt_recovery_model_2024-04-16 00:36:44.688601")  # instruct model; expand "~" so from_pretrained finds the local directory
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right")
tokenizer.pad_token = '[PAD]'
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

merged_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
)
merged_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # move the SentenceTransformer scorer onto the same device

# Read the test data from JSON
test_data_path = '~/llm_prompt_recovery/training_test_data/test_data.json'
# Correct the reading method to handle line-delimited JSON
test_data = pd.read_json(test_data_path, lines=True)

test_data['text'] = test_data['text'].apply(lambda x: eval(x) if isinstance(x, str) else x)

# Normalize the 'text' column to create a DataFrame
test_data = pd.json_normalize(test_data['text'])
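
# Each normalized row is expected to expose the fields used below, roughly
# (illustrative values, not from the actual dataset):
#   {"instruction": "Rewrite the text as a pirate shanty",
#    "input": "The meeting is at noon.",
#    "output": "Ye gather when the sun be highest, arr."}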

# Define a helper function to format the comparison results in a more readable way
def format_comparison(actual, predicted, score):
    return json.dumps({
        "Actual Instruction": actual,
        "Generated Instruction": predicted,
        "Similarity Score": score
    }, indent=4)

# Evaluate each entry in the test data and write results to a file
model_name = model_id.split("/")[-1]
output_filename = f'model_eval_{model_name}.txt'

import traceback

# Evaluate each row, catching and logging detailed error information
with open(output_filename, 'w') as file:
    scores = []  # List to store similarity scores
    print("Starting the evaluation...")
    total_entries = len(test_data)
    for index, row in test_data.iterrows():
        print(f"Processing {index + 1}/{total_entries}...")
        try:
            generated_instruction = get_completion_merged(row['input'], row['output'], merged_model, tokenizer)
            test_phrase, predicted_phrase, score = compare_phrases(row['instruction'], generated_instruction)
            scores.append(score)  # Append the score to the list
            comparison_result = format_comparison(test_phrase, predicted_phrase, score)
            file.write(comparison_result + '\n')
            file.write("---\n")
        except Exception as e:
            error_msg = f"Error processing row {index}: {str(e)}\n"
            file.write(error_msg)
            print(error_msg)
            print(traceback.format_exc())  # Print the full traceback

    if scores:
        average_score = sum(scores) / len(scores)
        average_result = f"Average T5 Similarity Score: {average_score:.4f}\n"
        file.write(average_result)
        print(average_result)
    else:
        print("No valid scores were calculated.")

print(f"Results have been written to {output_filename}.")

158 changes: 158 additions & 0 deletions finetune_model.py
@@ -0,0 +1,158 @@
import torch
import random
import numpy as np
import pandas as pd
import transformers
import accelerate
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
import json
import os
from huggingface_hub import login
from dotenv import load_dotenv

accelerator = Accelerator()
device = accelerator.device

# Load HF_TOKEN from a local .env file (python-dotenv), then authenticate
load_dotenv()
hf_access_token = os.getenv("HF_TOKEN")

login(token=hf_access_token)

# Get date and time
from datetime import datetime
now = datetime.now()


# Getting model and tokenizer
# NOTE: Trying base model
model_name = "meta-llama/Llama-2-70b-chat-hf"  # Specify the LLaMA-2 model name
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          add_eos_token=True,
                                          token=hf_access_token,
                                          padding="max_length",
                                          )
tokenizer.padding_side = 'right'
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

bnb_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
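
# Rough memory arithmetic (back-of-the-envelope, assumed numbers): 70B parameters
# at 4 bits/parameter is about 70e9 * 0.5 bytes ≈ 35 GB of weights, before
# activations, optimizer state, and LoRA adapters -- hence NF4 with double
# quantization rather than full-precision weights.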

# Data paths below are resolved relative to the current working directory
wd = os.getcwd()

# Specify the path to your JSON file
test_data_path = os.path.join(wd, "test_data.json")
train_data_path = os.path.join(wd, "train_data.json")

# Load the JSON data into DataFrames
test_data_df = pd.read_json(test_data_path, lines=True)
train_data_df = pd.read_json(train_data_path,lines=True)

test_data_df = Dataset.from_pandas(test_data_df)
train_data_df = Dataset.from_pandas(train_data_df)

print(train_data_df)
print(len(train_data_df['input_ids']))
print(len(train_data_df['attention_mask']))
print(len(train_data_df['text']))

# load onto cuda device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# instantiate model
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  quantization_config=bnb_config,
                                                  token=hf_access_token,
                                                  torch_dtype=torch.float16,
                                                  attn_implementation="flash_attention_2",
                                                  device_map="auto"
                                                  )
base_model.config.use_cache = False  # disable the KV cache during training (re-enable for inference)
base_model.config.pretraining_tp = 1

# NOTE: device_map="auto" already shards the model across the available GPUs;
# wrapping it in DataParallel would also break the gradient_checkpointing_enable
# call below, so the wrapper is left disabled.
# if torch.cuda.device_count() > 1:
#     base_model = torch.nn.DataParallel(base_model)

from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import bitsandbytes as bnb

base_model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(base_model)

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit  # if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
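
# For LLaMA-2 loaded in 4-bit, this typically resolves to the attention and MLP
# projections -- q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj --
# though the exact set depends on the loaded checkpoint.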

modules = find_all_linear_names(model)

lora_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.08,
    target_modules=modules,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)

tokenizer.pad_token = tokenizer.eos_token  # NOTE: overrides the '[PAD]' token added above

model = get_peft_model(model, lora_config)
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

output_dir = f"{wd}/{model_name}"
training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    warmup_ratio=0.03,  # fraction of steps used for LR warmup (warmup_steps expects an integer count)
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-3,
    logging_steps=1,
    max_steps=100,
    optim="paged_adamw_8bit",
    save_strategy="epoch",
    hub_token=hf_access_token,
)
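
# Effective batch size per optimizer step = per_device_train_batch_size (1)
# x gradient_accumulation_steps (4) x number of GPUs, so 4 sequences per step
# on a single device and proportionally more on a multi-GPU node.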

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data_df,
    eval_dataset=test_data_df,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=4000,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()
# Check if the directory exists, and create it if it doesn't
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Directory '{output_dir}' created.")
else:
    print(f"Directory '{output_dir}' already exists.")

new_model_name = f"{output_dir}/llama2_{model_name}_{now}"

# Replace spaces with underscores
new_model = new_model_name.replace(" ", "_")

# save the fine-tuned adapter weights
trainer.model.save_pretrained(new_model)
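
# Assumed follow-up (a sketch, not part of this commit): eval_model.py above loads a
# *merged* checkpoint, so the saved LoRA adapter presumably gets folded back into the
# base weights along these lines; merged_dir below is a placeholder output path.
from peft import PeftModel

merged_dir = f"{wd}/merged_llama2_70b_prompt_recovery_model"  # placeholder path
merge_base = AutoModelForCausalLM.from_pretrained(model_name,
                                                  token=hf_access_token,
                                                  torch_dtype=torch.bfloat16,
                                                  device_map="auto")
merged = PeftModel.from_pretrained(merge_base, new_model).merge_and_unload()
merged.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)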
