Commit
Showing 17 changed files with 1,525 additions and 265 deletions.
@@ -0,0 +1,3 @@
.env
prompts/
mpi_run_wrapper.sh
@@ -0,0 +1,5 @@
import pkg_resources

print("Installed packages:")
for pkg in pkg_resources.working_set:
    print(f"{pkg.project_name} - {pkg.version}")
@@ -0,0 +1,154 @@
# Load imports
import torch
import random
import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from sentence_transformers import SentenceTransformer
import json
import os

pd.set_option('display.max_colwidth', None)

# Implement T5 comparison
model = SentenceTransformer('sentence-t5-base')

def sharpened_cosine_similarity(vec1, vec2, exponent=3):
    cosine_similarity = torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
    return cosine_similarity ** exponent

# Compare the similarity of the phrases
def compare_phrases(test_phrase, predicted_phrase):
    print(f"actual instruction: {test_phrase}")
    print(f"predicted instruction: {predicted_phrase}")

    test_embedding = model.encode(test_phrase, convert_to_tensor=True, show_progress_bar=False)
    compare_embedding = model.encode(predicted_phrase, convert_to_tensor=True, show_progress_bar=False)
    score = sharpened_cosine_similarity(test_embedding, compare_embedding).item()

    print(f"Similarity score: {score}\n")

    return (test_phrase, predicted_phrase, score)

def generate_prompt(input_text, output_text):
    return f"""
Given the original text and a transformed version of it, deduce the instructions that might have guided the transformation.
Original Text: "{input_text}"
Transformed Text: "{output_text}"
What instruction could have led to this transformation?
"""

def get_completion_merged(input_text: str, output_text: str, model, tokenizer) -> str:
    # Build the prompt using the refined template above
    prompt = generate_prompt(input_text, output_text)
    #prompt = prompt_template.format(input=input_text, output=output_text)
    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    if 'input_ids' in encodeds:
        model_inputs = encodeds['input_ids'].to(device)
        prompt_length = model_inputs.shape[1]
    else:
        raise ValueError("Tokenized inputs do not contain 'input_ids'.")

    # Generate text from the model
    generated_ids = model.generate(
        inputs=model_inputs,
        max_new_tokens=100,              # cap on newly generated tokens
        do_sample=True,                  # enable sampling for diverse output
        top_k=80,                        # top-k sampling
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode generated tokens to text and return the single generated sequence
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

    # Alternative (currently unused): keep only the first sentence of each decoded text
    #decoded_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    #first_sentences = []
    #for decoded in decoded_texts:
    #    # Find the end of the first sentence using the first occurrence of any end punctuation
    #    first_sentence_end = next((index for index, char in enumerate(decoded) if char in ".!?"), len(decoded))
    #    first_sentence = decoded[:first_sentence_end + 1]
    #    first_sentences.append(first_sentence)
    ## Return just the first sentence if only one result is expected
    #return first_sentences[0] if first_sentences else ""

# Model setup
# Note: "~" is not expanded automatically, so expand it before handing the path to from_pretrained
model_id = os.path.expanduser("~/merged_llama2_70b_prompt_recovery_model_2024-04-16 00:36:44.688601")  # instruct model
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right")
tokenizer.pad_token = '[PAD]'
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

merged_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
)
merged_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # move the SentenceTransformer encoder to the same device

# Read the test data from line-delimited JSON
test_data_path = '~/llm_prompt_recovery/training_test_data/test_data.json'
test_data = pd.read_json(test_data_path, lines=True)

# Entries in the 'text' column may be stored as stringified dicts; convert them back to dicts
test_data['text'] = test_data['text'].apply(lambda x: eval(x) if isinstance(x, str) else x)

# Normalize the 'text' column to create a DataFrame
test_data = pd.json_normalize(test_data['text'])

# Helper to format the comparison results in a more readable way
def format_comparison(actual, predicted, score):
    return json.dumps({
        "Actual Instruction": actual,
        "Generated Instruction": predicted,
        "Similarity Score": score
    }, indent=4)

# Evaluate each entry in the test data and write results to a file
model_name = model_id.split("/")[-1]
output_filename = f'model_eval_{model_name}.txt'

import traceback

# Evaluation loop: score each row and log detailed error information on failure
with open(output_filename, 'w') as file:
    scores = []  # List to store similarity scores
    print("Starting the evaluation...")
    total_entries = len(test_data)
    for index, row in test_data.iterrows():
        print(f"Processing {index + 1}/{total_entries}...")
        try:
            generated_instruction = get_completion_merged(row['input'], row['output'], merged_model, tokenizer)
            test_phrase, predicted_phrase, score = compare_phrases(row['instruction'], generated_instruction)
            scores.append(score)  # Append the score to the list
            comparison_result = format_comparison(test_phrase, predicted_phrase, score)
            file.write(comparison_result + '\n')
            file.write("---\n")
        except Exception as e:
            error_msg = f"Error processing row {index}: {str(e)}\n"
            file.write(error_msg)
            print(error_msg)
            print(traceback.format_exc())  # print the full traceback

    if scores:
        average_score = sum(scores) / len(scores)
        average_result = f"Average T5 Similarity Score: {average_score:.4f}\n"
        file.write(average_result)
        print(average_result)
    else:
        print("No valid scores were calculated.")

print(f"Results have been written to {output_filename}.")
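The sharpened cosine similarity used above is just the ordinary cosine similarity raised to a power (3 by default), which pushes loosely related instruction pairs toward zero while leaving near-paraphrases close to 1. A minimal illustration of that effect, using the same sentence-t5-base encoder and made-up instruction pairs (not taken from the test data):

from sentence_transformers import SentenceTransformer
import torch

encoder = SentenceTransformer('sentence-t5-base')

def sharpened_cosine_similarity(vec1, vec2, exponent=3):
    # Cube the cosine similarity to sharpen the contrast between close and distant pairs
    return torch.nn.functional.cosine_similarity(vec1, vec2, dim=0) ** exponent

a = encoder.encode("Rewrite the text as a formal email.", convert_to_tensor=True)
b = encoder.encode("Turn this passage into a professional email.", convert_to_tensor=True)
c = encoder.encode("Translate the text into French.", convert_to_tensor=True)

print(sharpened_cosine_similarity(a, b).item())  # near-paraphrase: stays relatively high
print(sharpened_cosine_similarity(a, c).item())  # unrelated instruction: the cube drives it toward zero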
@@ -0,0 +1,158 @@
import torch
import random
import numpy as np
import pandas as pd
import transformers
import accelerate
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
import json
import os
from huggingface_hub import login
from dotenv import load_dotenv

accelerator = Accelerator()
device = accelerator.device

# Read HF_TOKEN from the local .env file (kept out of the repo via .gitignore) and log in to the Hub
load_dotenv()
hf_access_token = os.getenv("HF_TOKEN")

login(token=hf_access_token)

# Get date and time
from datetime import datetime
now = datetime.now()


# Getting model and tokenizer
# NOTE: Trying base model
model_name = "meta-llama/Llama-2-70b-chat-hf"  # Specify the LLaMA-2 model name
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True,
                                          token=hf_access_token,
                                          padding="max_length",
                                          )
tokenizer.padding_side = 'right'
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

bnb_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Change this path to match your own working directory
wd = os.getcwd()

# Specify the paths to the JSON data files
test_data_path = os.path.join(wd, "test_data.json")
train_data_path = os.path.join(wd, "train_data.json")

# Load the line-delimited JSON data into DataFrames
test_data_df = pd.read_json(test_data_path, lines=True)
train_data_df = pd.read_json(train_data_path, lines=True)

# Convert to Hugging Face Datasets
test_data_df = Dataset.from_pandas(test_data_df)
train_data_df = Dataset.from_pandas(train_data_df)

print(train_data_df)
print(len(train_data_df['input_ids']))
print(len(train_data_df['attention_mask']))
print(len(train_data_df['text']))

# load onto cuda device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# instantiate model
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  quantization_config=bnb_config,
                                                  token=hf_access_token,
                                                  torch_dtype=torch.float16,
                                                  attn_implementation="flash_attention_2",
                                                  device_map="auto"
                                                  )
base_model.config.use_cache = False  # disable the KV cache during training; set True for inference
base_model.config.pretraining_tp = 1

# Wrap the model with DataParallel for multi-GPU usage
# NOTE: device_map="auto" already shards the model across available GPUs, so this
# wrapper is usually unnecessary and can interfere with the k-bit/PEFT preparation below
if torch.cuda.device_count() > 1:
    base_model = torch.nn.DataParallel(base_model)

from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import bitsandbytes as bnb

base_model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(base_model)

# Collect the names of all 4-bit linear modules to target with LoRA
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit  # if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

lora_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.08,
    target_modules=modules,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)

tokenizer.pad_token = tokenizer.eos_token

model = get_peft_model(model, lora_config)
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

output_dir = f"{wd}/{model_name}"
training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    warmup_steps=0.03,  # NOTE: warmup_steps expects an integer step count; warmup_ratio=0.03 was probably intended
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-3,
    logging_steps=1,
    max_steps=100,
    optim="paged_adamw_8bit",
    save_strategy="epoch",
    hub_token=hf_access_token,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data_df,
    eval_dataset=test_data_df,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=4000,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()

# Check if the output directory exists, and create it if it doesn't
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Directory '{output_dir}' created.")
else:
    print(f"Directory '{output_dir}' already exists.")

new_model_name = f"{output_dir}llama2_{model_name}_{now}"

# Replace spaces with underscores
new_model = new_model_name.replace(" ", "_")

# save finetuned model
trainer.model.save_pretrained(new_model)
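The evaluation script earlier in this commit loads an already merged checkpoint, while save_pretrained here writes only the LoRA adapter. A merge step roughly like the following sketch would be needed to produce such a checkpoint; the paths are assumptions for illustration, not taken from the commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hypothetical paths: point adapter_dir at the directory written by trainer.model.save_pretrained(new_model)
base_model_name = "meta-llama/Llama-2-70b-chat-hf"
adapter_dir = "path/to/saved_lora_adapter"
merged_dir = "merged_llama2_70b_prompt_recovery_model"

# Load the base model unquantized (merging LoRA weights requires full-precision/bf16 weights)
base = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.bfloat16, device_map="auto")

# Attach the LoRA adapter and fold its weights into the base model
merged = PeftModel.from_pretrained(base, adapter_dir).merge_and_unload()
merged.save_pretrained(merged_dir)

# Save the tokenizer alongside so the evaluation script can load both from one path
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.save_pretrained(merged_dir)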