Commit

first commit on my remote
jjia1 committed Jun 28, 2024
1 parent 62d67b5 commit 99d1ba2
Showing 17 changed files with 1,525 additions and 265 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.env
prompts/
mpi_run_wrapper.sh
5 changes: 5 additions & 0 deletions check_pkgs.py
@@ -0,0 +1,5 @@
import pkg_resources

print("Installed packages:")
for pkg in pkg_resources.working_set:
    print(f"{pkg.project_name} - {pkg.version}")
154 changes: 154 additions & 0 deletions eval_model.py
@@ -0,0 +1,154 @@
# Load imports
import torch
import random
import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from sentence_transformers import SentenceTransformer
import json
import os

pd.set_option('display.max_colwidth', None)

# Implement T5 comparison
model = SentenceTransformer('sentence-t5-base')

def sharpened_cosine_similarity(vec1, vec2, exponent=3):
    cosine_similarity = torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
    return cosine_similarity ** exponent
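
# Worked example (assumed values, for illustration only): raising the cosine
# similarity to an odd exponent keeps its sign but compresses mid-range scores,
# so only strong matches stay high:
#   0.90 ** 3 = 0.729
#   0.50 ** 3 = 0.125
#   0.10 ** 3 = 0.001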

# Compare the similarity of the phrases
def compare_phrases(test_phrase, predicted_phrase):
    print(f"actual instruction: {test_phrase}")
    print(f"predicted instruction: {predicted_phrase}")

    test_embedding = model.encode(test_phrase, convert_to_tensor=True, show_progress_bar=False)
    compare_embedding = model.encode(predicted_phrase, convert_to_tensor=True, show_progress_bar=False)
    score = sharpened_cosine_similarity(test_embedding, compare_embedding).item()

    print(f"Similarity score: {score}\n")

    return test_phrase, predicted_phrase, score


def generate_prompt(input_text, output_text):
    return f"""
Given the original text and a transformed version of it, deduce the instructions that might have guided the transformation.
Original Text: "{input_text}"
Transformed Text: "{output_text}"
What instruction could have led to this transformation?
"""



def get_completion_merged(input_text: str, output_text: str, model, tokenizer) -> str:
    # Build the prompt from the refined template
    prompt = generate_prompt(input_text, output_text)
    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    if 'input_ids' in encodeds:
        model_inputs = encodeds['input_ids'].to(device)
        prompt_length = model_inputs.shape[1]
    else:
        raise ValueError("Tokenized inputs do not contain 'input_ids'.")

    # Generate text from the model
    generated_ids = model.generate(
        inputs=model_inputs,
        max_new_tokens=100,               # cap on newly generated tokens
        do_sample=True,                   # enable sampling for diverse output
        top_k=80,                         # top-k sampling
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode generated tokens to text and return the first (and only) sequence
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

    # Alternative post-processing (unused): truncate each decoded text at the end
    # of its first sentence before returning it.
    # first_sentences = []
    # for decoded_text in decoded:
    #     first_sentence_end = next((i for i, char in enumerate(decoded_text) if char in ".!?"), len(decoded_text))
    #     first_sentences.append(decoded_text[:first_sentence_end + 1])
    # return first_sentences[0] if first_sentences else ""


# Model setup
model_id = os.path.expanduser("~/merged_llama2_70b_prompt_recovery_model_2024-04-16 00:36:44.688601")  # instruct model; expand "~" so from_pretrained finds the local directory
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right")
tokenizer.pad_token = '[PAD]'
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

merged_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
)
merged_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # move the SentenceTransformer scorer onto the same device

# Read the test data from JSON
test_data_path = '~/llm_prompt_recovery/training_test_data/test_data.json'
# Correct the reading method to handle line-delimited JSON
test_data = pd.read_json(test_data_path, lines=True)

test_data['text'] = test_data['text'].apply(lambda x: eval(x) if isinstance(x, str) else x)

# Normalize the 'text' column to create a DataFrame
test_data = pd.json_normalize(test_data['text'])
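
# Each normalized row is expected to expose the fields used below, roughly
# (illustrative values, not from the actual dataset):
#   {"instruction": "Rewrite the text as a pirate shanty",
#    "input": "The meeting is at noon.",
#    "output": "Ye gather when the sun be highest, arr."}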

# Define a helper function to format the comparison results in a more readable way
def format_comparison(actual, predicted, score):
    return json.dumps({
        "Actual Instruction": actual,
        "Generated Instruction": predicted,
        "Similarity Score": score
    }, indent=4)

# Evaluate each entry in the test data and write results to a file
model_name = model_id.split("/")[-1]
output_filename = f'model_eval_{model_name}.txt'

import traceback

# Evaluate each row, catching and logging detailed error information
with open(output_filename, 'w') as file:
    scores = []  # List to store similarity scores
    print("Starting the evaluation...")
    total_entries = len(test_data)
    for index, row in test_data.iterrows():
        print(f"Processing {index + 1}/{total_entries}...")
        try:
            generated_instruction = get_completion_merged(row['input'], row['output'], merged_model, tokenizer)
            test_phrase, predicted_phrase, score = compare_phrases(row['instruction'], generated_instruction)
            scores.append(score)  # Append the score to the list
            comparison_result = format_comparison(test_phrase, predicted_phrase, score)
            file.write(comparison_result + '\n')
            file.write("---\n")
        except Exception as e:
            error_msg = f"Error processing row {index}: {str(e)}\n"
            file.write(error_msg)
            print(error_msg)
            print(traceback.format_exc())  # Print the full traceback

    if scores:
        average_score = sum(scores) / len(scores)
        average_result = f"Average T5 Similarity Score: {average_score:.4f}\n"
        file.write(average_result)
        print(average_result)
    else:
        print("No valid scores were calculated.")

print(f"Results have been written to {output_filename}.")

158 changes: 158 additions & 0 deletions finetune_model.py
@@ -0,0 +1,158 @@
import torch
import random
import numpy as np
import pandas as pd
import transformers
import accelerate
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
import json
import os
from huggingface_hub import login
from dotenv import load_dotenv

accelerator = Accelerator()
device = accelerator.device

# Load HF_TOKEN from a local .env file (python-dotenv), then authenticate
load_dotenv()
hf_access_token = os.getenv("HF_TOKEN")

login(token=hf_access_token)

# Get date and time
from datetime import datetime
now = datetime.now()


# Getting model and tokenizer
# NOTE: Trying base model
model_name = "meta-llama/Llama-2-70b-chat-hf"  # Specify the LLaMA-2 model name
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          add_eos_token=True,
                                          token=hf_access_token,
                                          padding="max_length",
                                          )
tokenizer.padding_side = 'right'
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

bnb_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
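
# Rough memory arithmetic (back-of-the-envelope, assumed numbers): 70B parameters
# at 4 bits/parameter is about 70e9 * 0.5 bytes ≈ 35 GB of weights, before
# activations, optimizer state, and LoRA adapters -- hence NF4 with double
# quantization rather than full-precision weights.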

# Data paths below are resolved relative to the current working directory
wd = os.getcwd()

# Specify the path to your JSON file
test_data_path = os.path.join(wd, "test_data.json")
train_data_path = os.path.join(wd, "train_data.json")

# Load the JSON data into DataFrames
test_data_df = pd.read_json(test_data_path, lines=True)
train_data_df = pd.read_json(train_data_path,lines=True)

test_data_df = Dataset.from_pandas(test_data_df)
train_data_df = Dataset.from_pandas(train_data_df)

print(train_data_df)
print(len(train_data_df['input_ids']))
print(len(train_data_df['attention_mask']))
print(len(train_data_df['text']))

# load onto cuda device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# instantiate model
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  quantization_config=bnb_config,
                                                  token=hf_access_token,
                                                  torch_dtype=torch.float16,
                                                  attn_implementation="flash_attention_2",
                                                  device_map="auto"
                                                  )
base_model.config.use_cache = False  # disable the KV cache during training (re-enable for inference)
base_model.config.pretraining_tp = 1

# NOTE: device_map="auto" already shards the model across the available GPUs;
# wrapping it in DataParallel would also break the gradient_checkpointing_enable
# call below, so the wrapper is left disabled.
# if torch.cuda.device_count() > 1:
#     base_model = torch.nn.DataParallel(base_model)

from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import bitsandbytes as bnb

base_model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(base_model)

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit  # if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
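
# For LLaMA-2 loaded in 4-bit, this typically resolves to the attention and MLP
# projections -- q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj --
# though the exact set depends on the loaded checkpoint.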

modules = find_all_linear_names(model)

lora_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.08,
    target_modules=modules,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)

tokenizer.pad_token = tokenizer.eos_token  # NOTE: overrides the '[PAD]' token added above

model = get_peft_model(model, lora_config)
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

output_dir = f"{wd}/{model_name}"
training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    warmup_ratio=0.03,  # fraction of steps used for LR warmup (warmup_steps expects an integer count)
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-3,
    logging_steps=1,
    max_steps=100,
    optim="paged_adamw_8bit",
    save_strategy="epoch",
    hub_token=hf_access_token,
)
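
# Effective batch size per optimizer step = per_device_train_batch_size (1)
# x gradient_accumulation_steps (4) x number of GPUs, so 4 sequences per step
# on a single device and proportionally more on a multi-GPU node.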

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data_df,
    eval_dataset=test_data_df,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=4000,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()
# Check if the directory exists, and create it if it doesn't
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Directory '{output_dir}' created.")
else:
    print(f"Directory '{output_dir}' already exists.")

new_model_name = f"{output_dir}/llama2_{model_name}_{now}"

# Replace spaces with underscores
new_model = new_model_name.replace(" ", "_")

# save the fine-tuned adapter weights
trainer.model.save_pretrained(new_model)
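
# Assumed follow-up (a sketch, not part of this commit): eval_model.py above loads a
# *merged* checkpoint, so the saved LoRA adapter presumably gets folded back into the
# base weights along these lines; merged_dir below is a placeholder output path.
from peft import PeftModel

merged_dir = f"{wd}/merged_llama2_70b_prompt_recovery_model"  # placeholder path
merge_base = AutoModelForCausalLM.from_pretrained(model_name,
                                                  token=hf_access_token,
                                                  torch_dtype=torch.bfloat16,
                                                  device_map="auto")
merged = PeftModel.from_pretrained(merge_base, new_model).merge_and_unload()
merged.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)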
