# main.py
# 64 lines (48 loc) · 3.04 KB
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
# Input CSV locations. These are machine-specific absolute paths; point them
# at your own copies of the datasets before running.
JOBS_CSV_PATH = '/Users/amalmr/Desktop/Freelancing AI 2/archive/jobs_rows.csv'
USERS_CSV_PATH = '/Users/amalmr/Desktop/Freelancing AI 2/archive/user_large.csv'

# Minimum value of 'experience_years' a user needs to be kept for matching.
MIN_EXPERIENCE_YEARS = 5


def _split_skills(raw):
    """Turn a comma-separated skills cell into a list of clean skill names.

    Non-string cells (for example a missing value) yield an empty list.
    Whitespace around each name is stripped and empty fragments are dropped,
    so ``"python, sql,"`` becomes ``["python", "sql"]`` rather than
    ``["python", " sql", ""]``. Without this, ``" sql"`` never equals ``"sql"``
    and two rows with a trailing comma appear to share the skill ``""``.
    """
    if not isinstance(raw, str):
        return []
    return [name.strip() for name in raw.split(',') if name.strip()]


jobs = pd.read_csv(JOBS_CSV_PATH)
users = pd.read_csv(USERS_CSV_PATH)

# Print the column names so a mismatch with the names used below is easy to spot.
print("Jobs columns:", jobs.columns)
print("Users columns:", users.columns)

# Replace the raw 'skills' strings with lists of individual skill names.
jobs['skills'] = jobs['skills'].apply(_split_skills)
users['skills'] = users['skills'].apply(_split_skills)

# Keep only users with at least MIN_EXPERIENCE_YEARS years of experience.
filtered_users = users[users['experience_years'] >= MIN_EXPERIENCE_YEARS]
# Function to compute skill match score (intersection of job and user skills)
def _clean_skills(skills):
    """Return *skills* as a set with string entries stripped and blanks removed."""
    if skills is None:
        return set()
    if isinstance(skills, str):
        # A bare string would otherwise be iterated character by character.
        skills = skills.split(',')
    cleaned = set()
    for skill in skills:
        if isinstance(skill, str):
            skill = skill.strip()
            if not skill:
                # Empty fragments (e.g. from a trailing comma) are not skills.
                continue
        cleaned.add(skill)
    return cleaned


def skill_match(job_skills, user_skills):
    """Return the number of distinct skills shared by a job and a user.

    Args:
        job_skills: Iterable of skill names required by the job. A
            comma-separated string or ``None`` is also accepted.
        user_skills: Iterable of skill names the user has, in the same
            accepted forms as ``job_skills``.

    Returns:
        The size of the intersection of the two skill sets. Duplicates count
        once, surrounding whitespace is ignored, and empty names are never
        counted as a match. Comparison is case-sensitive.
    """
    return len(_clean_skills(job_skills) & _clean_skills(user_skills))
# Score every (job, experienced user) pair by the number of skills they share.
# Iterating over plain column values avoids rebuilding a pandas Series for each
# user row on every pass of the inner loop, which nested iterrows() would do.
# NOTE: 'id' (jobs) and 'user_id' (users) must match the column names printed
# when the CSV files are loaded; adjust them here if your data differs.
MIN_SKILL_MATCH = 2  # pairs sharing fewer skills than this are discarded

job_rows = list(zip(jobs['id'], jobs['skills']))
user_rows = list(zip(filtered_users['user_id'], filtered_users['skills']))

# One [job id, user id, score] entry per pair: jobs in the outer loop, users
# in the inner loop.
user_job_match = [
    [job_id, user_id, skill_match(job_skills, user_skills)]
    for job_id, job_skills in job_rows
    for user_id, user_skills in user_rows
]

# Tabulate the scores and keep only the significant matches.
matches = pd.DataFrame(user_job_match, columns=['id', 'User_ID', 'Skill_Match'])
matches = matches[matches['Skill_Match'] >= MIN_SKILL_MATCH]
# Attach the full job record, then the full user record, to each surviving match.
matches_with_jobs = matches.merge(jobs, on='id')
merged_data = matches_with_jobs.merge(users, left_on='User_ID', right_on='user_id')

# Jobs-by-users score matrix: one row per job id, one column per user id.
pivot_table = merged_data.pivot_table(
    index='id',
    columns='User_ID',
    values='Skill_Match',
)
# Job/user pairs without a surviving match have no score; store them as 0.
pivot_table = pivot_table.fillna(0)
# Fit a brute-force nearest-neighbour index over the rows of the pivot table
# (one row per job), so jobs with similar per-user match scores are neighbours.
if pivot_table.empty:
    raise ValueError(
        "No job/user pairs passed the skill-match filter; nothing to fit."
    )

job_sparse = csr_matrix(pivot_table)
model = NearestNeighbors(algorithm='brute')
model.fit(job_sparse)

# Example query: the jobs closest to the first job (row 0) of the pivot table.
# The query row is itself part of the fitted data, so it normally appears in
# the result. kneighbors() raises ValueError when asked for more neighbours
# than there are fitted rows, so cap the request at the number of jobs.
n_neighbors = min(6, pivot_table.shape[0])
distance, suggestion = model.kneighbors(
    pivot_table.iloc[0, :].values.reshape(1, -1),
    n_neighbors=n_neighbors,
)
# Persist the fitted model, the pivot-table row index (job ids) and the
# filtered matches for later use.
artifacts_dir = Path('artifacts')
# Create the output directory up front; open() does not create parent folders.
artifacts_dir.mkdir(parents=True, exist_ok=True)

artifacts = {
    'skill_match_model.pkl': model,
    'job_ids.pkl': pivot_table.index,
    'skill_matches.pkl': matches,
}
for filename, obj in artifacts.items():
    # The with-block closes (and flushes) each file even if pickling fails.
    with open(artifacts_dir / filename, 'wb') as fh:
        pickle.dump(obj, fh)

print("Recommendation system model created and saved!")