fine_tuning_data_setup.py
import copy
import json
import os
import re
import time
from typing import Union

import dotenv
import openai
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from tqdm import tqdm

from utils import file_reader

# Tokenizer used to measure the length of each training example.
tokenizer = tiktoken.get_encoding("cl100k_base")

dotenv.load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')
embeder = OpenAIEmbeddings()  # embeddings client (not used in the current flow)
JSON_TEMPLATE = {
    "messages": [
        {"role": "system",
         "content": "You are a resume parser. Extract the "},  # completed with the section to extract
        {"role": "user",
         "content": ""},  # the resume as a whole
        {"role": "assistant",
         "content": ""},  # the section extracted
    ]
}
def process_profiles(path: str):
"""
Process profiles_without_dot in the specified directory and return a list of JSON objects.
"""
profiles_data = []
for dir_path in tqdm(os.listdir(path)):
aim_profile = ''
resume = ''
full_dir_path = os.path.join(path, dir_path)
for file in os.listdir(full_dir_path):
if file.startswith('~'):
continue
file_path = os.path.join(full_dir_path, file)
mode = 'r' if file.endswith('.txt') else 'rb'
with open(file_path, mode) as f:
try:
if file.startswith('AIM P'):
aim_profile = file_reader(f)
else:
resume = file_reader(f)
except Exception as e:
print(f'Error reading {file_path}: {e}')
continue
try:
summary, skills, experience, education, certifications, awards = parse_aim_resumes(aim_profile)
except Exception as e:
print(f'Error parsing {dir_path}: {e}')
continue
if not resume:
continue
try:
profile_data = {
"resume": resume,
"summary": summary,
"skills": skills,
"experience": experience,
"education": education,
"certifications": certifications,
"awards": awards,
}
profiles_data.append(profile_data)
except Exception as e:
            print(f'Error building profile data for {dir_path}: {e}')
continue
return profiles_data
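
# Minimal usage sketch for process_profiles (a hypothetical directory layout; the
# only assumptions the function itself makes are one sub-folder per candidate and
# that the AIM profile file name starts with 'AIM P', any other file being the resume):
#
#     profile_types/profiles_with_dot/
#         jane_doe/
#             AIM Profile - Jane Doe.txt    # parsed into sections
#             Jane Doe Resume.docx          # kept whole as the 'resume' field
#
#     profiles = process_profiles(path='profile_types/profiles_with_dot')
#     print(len(profiles), list(profiles[0].keys()))
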
# def parse_aim_resumes(resume: str):
# """
# AIM resumes have a specific, consistent format.
# """
# splits = ['SUMMARY', "SKILLS AND TECHNOLOGIES", "PROFESSIONAL EXPERIENCE", "EDUCATION", "CERTIFICATIONS"]
# summary = resume.split(splits[0], 1)[1].split(splits[1], 1)[0]
# skills_and_tech = resume.split(splits[1], 1)[1].split(splits[2], 1)[0]
# professional_experience = resume.split(splits[2], 1)[1].split(splits[3], 1)[0]
# education = resume.split(splits[3], 1)[1].split(splits[4], 1)[0]
# certifications = resume.split(splits[4], 1)[1]
#
#
# return summary, skills_and_tech, professional_experience, education, certifications
def parse_aim_resumes(resume: str):
"""
AIM resumes have a specific, consistent format.
"""
# Regular expressions for each section
section_patterns = {
'summary': r'SUMMARY\s*(.*?)(?=SKILLS AND TECHNOLOG(?:Y|IES)|PROFESSIONAL EXPERIENCE|EDUCATION|CERTIFICATIONS|AWARDS|$)',
'skills_and_tech': r'SKILLS AND TECHNOLOG(?:Y|IES)\s*(.*?)(?=PROFESSIONAL EXPERIENCE|EDUCATION|CERTIFICATIONS|AWARDS|$)',
'professional_experience': r'PROFESSIONAL EXPERIENCE\s*(.*?)(?=EDUCATION|CERTIFICATIONS|AWARDS|$)',
'education': r'EDUCATION\s*(.*?)(?=CERTIFICATIONS|AWARDS|$)',
'certifications': r'CERTIFICATIONS\s*(.*?)(?=AWARDS|$)',
'awards': r'AWARDS\s*(.*)'
}
# Extract sections using regex
sections = {}
for section_name, pattern in section_patterns.items():
match = re.search(pattern, resume, re.DOTALL)
sections[section_name] = match.group(1).strip() if match else None
return (sections['summary'], sections['skills_and_tech'], sections['professional_experience'],
sections['education'], sections['certifications'], sections['awards'])
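
# Behavior sketch for parse_aim_resumes on a toy input (the resume string below is
# made up for illustration):
#
#     sample = ("SUMMARY Experienced data engineer. "
#               "SKILLS AND TECHNOLOGIES Python, SQL. "
#               "PROFESSIONAL EXPERIENCE Acme Corp, 2019-2023. "
#               "EDUCATION B.Sc. Computer Science.")
#     summary, skills, experience, education, certs, awards = parse_aim_resumes(sample)
#     # certs and awards come back as None because those headers are absent.
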
def format_for_finetune(json_path: Union[str, bytes, os.PathLike], save_path: Union[str, bytes, os.PathLike]):
"""
Format the data for fine-tuning saving a jsonl file.
"""
if os.path.exists(save_path):
os.remove(save_path)
succeeded = 0
failed = 0
with open(json_path, 'r') as f:
profiles_data = json.load(f)
for idx, profile_data in enumerate(profiles_data):
resume = profile_data['resume']
for key, value in profile_data.items():
            # Deep-copy so the nested message dicts in JSON_TEMPLATE are not mutated in place.
            json_format = copy.deepcopy(JSON_TEMPLATE)
if (key == 'resume') or (not value):
continue
json_format['messages'][1]['content'] = resume
json_format['messages'][2]['content'] = value
json_format['messages'][0]['content'] = f'You are a resume parser. Extract the {key} section'
            # Skip examples that exceed the 16k-token training context window.
            full_str = str(json_format)
            if len(tokenizer.encode(full_str)) > 16000:
                print(f'Example {idx} is longer than 16000 tokens. Length: {len(tokenizer.encode(full_str))}')
failed += 1
del json_format
continue
succeeded += 1
with open(save_path, 'a') as f:
json.dump(json_format, f)
f.write('\n')
del json_format
print(f'Succeeded: {succeeded}, Failed: {failed}')
print(f'Ratios: {succeeded / (succeeded + failed)}')
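
# Quick sanity check of the generated JSONL (a sketch, assuming the default output
# path used in __main__ below):
#
#     with open('fine_tuning_data_with_dot.jsonl') as f:
#         for line in f:
#             record = json.loads(line)
#             assert [m['role'] for m in record['messages']] == ['system', 'user', 'assistant']
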
def start_finetuning_job(file_path: Union[str, bytes, os.PathLike]):
"""
    Upload the training file and start a fine-tuning job.
    Result of a previous fine-tuning run, kept here for reference:
{
"object": "fine_tuning.job",
"id": "ftjob-jHaLv1MpjJSs1wRPhxdiYXla",
"model": "gpt-3.5-turbo-0613",
"created_at": 1693960013,
"finished_at": 1693963571,
"fine_tuned_model": "ft:gpt-3.5-turbo-0613:personal::7vbb2i7t",
"organization_id": "org-ev4E5NkqvqtGmw8yJ80vok7v",
"result_files": [
"file-U99WerWwYyvENhHP0ckn7q1M"
],
"status": "succeeded",
"validation_file": null,
"training_file": "file-LdgbGbVzGwXs7pIzdOf62NbU",
"hyperparameters": {
"n_epochs": 3
},
"trained_tokens": 2265357
}
"""
filename = os.path.basename(file_path)
# check if job is running already
jobs = openai.FineTuningJob.list()
# if jobs['data']:
# print(f'Job {filename} is already running.')
# return
data_resp = openai.File.create(
file=open(file_path, "rb"),
purpose='fine-tune',
user_provided_filename=filename
)
    # Wait for the uploaded file to finish processing before starting the job.
    while data_resp['status'] != 'processed':
        time.sleep(5)
        data_resp = openai.File.retrieve(id=data_resp['id'])
response = openai.FineTuningJob.create(training_file=data_resp['id'], model="gpt-3.5-turbo-0125")
print(response)
    # Poll until the fine-tuning job finishes; this can take a while.
    while response['status'] != 'succeeded':
        time.sleep(30)
        response = openai.FineTuningJob.retrieve(id=response['id'])
return response
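
# Sketch of calling the fine-tuned model once the job succeeds (the model id is the
# one recorded in the docstring above; openai.ChatCompletion is the pre-1.0 client
# API used throughout this script, and resume_text stands in for a raw resume string):
#
#     completion = openai.ChatCompletion.create(
#         model="ft:gpt-3.5-turbo-0613:personal::7vbb2i7t",
#         messages=[
#             {"role": "system", "content": "You are a resume parser. Extract the summary section"},
#             {"role": "user", "content": resume_text},
#         ],
#     )
#     print(completion['choices'][0]['message']['content'])
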
if __name__ == '__main__':
FINETUNE = True
dot = '_with_dot'
profiles_data = process_profiles(path='profile_types/profiles' + dot)
with open('profiles_data.json', 'w') as f:
json.dump(profiles_data, f, indent=4)
format_for_finetune(json_path='profiles_data.json', save_path=f'fine_tuning_data{dot}.jsonl')
if FINETUNE:
job = start_finetuning_job(file_path=f'fine_tuning_data{dot}.jsonl')
print("=====================================")
print(f"Successfully started fine-tuning job: {job['id']}")
print("=====================================")