diff --git a/api/mlApi/download.py b/api/mlApi/data_preprocess.py
similarity index 100%
rename from api/mlApi/download.py
rename to api/mlApi/data_preprocess.py
diff --git a/api/mlApi/termsMlModel.py b/api/mlApi/termsMlModel.py
deleted file mode 100644
index bd98839..0000000
--- a/api/mlApi/termsMlModel.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import torch
-from torch.utils.data import DataLoader, Dataset
-from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score
-
-def load_dataset(dataset_path):
-    # Replace this with your actual dataset loading code
-    import pandas as pd
-    df = pd.read_csv(dataset_path)
-    texts = df['text'].tolist()
-    labels = df['label'].tolist()
-    return texts, labels
-
-def preprocess_data(texts, labels, max_length):
-    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
-    dataset = DarkPatternDataset(encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels))
-    return dataset
-
-def train_model(model, train_dataloader, val_dataloader, epochs, learning_rate):
-    optimizer = AdamW(model.parameters(), lr=learning_rate)
-    total_steps = len(train_dataloader) * epochs
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
-
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    model.to(device)
-    model.train()
-
-    for epoch in range(epochs):
-        for batch in train_dataloader:
-            input_ids = batch['input_ids'].to(device)
-            attention_mask = batch['attention_mask'].to(device)
-            labels = batch['labels'].to(device)
-
-            optimizer.zero_grad()
-            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
-            loss = outputs.loss
-            loss.backward()
-            optimizer.step()
-            scheduler.step()
-
-def evaluate_model(model, val_dataloader, device='cpu'):
-    model.eval()
-    val_predictions = []
-    val_true_labels = []
-
-    with torch.no_grad():
-        for batch in val_dataloader:
-            input_ids = batch['input_ids'].to(device)
-            attention_mask = batch['attention_mask'].to(device)
-            labels = batch['labels'].to(device)
-
-            outputs = model(input_ids, attention_mask=attention_mask)
-            logits = outputs.logits
-            predictions = torch.argmax(logits, dim=1).cpu().numpy()
-
-            val_predictions.extend(predictions)
-            val_true_labels.extend(labels.cpu().numpy())
-
-    accuracy = accuracy_score(val_true_labels, val_predictions)
-    print(f'Validation Accuracy: {accuracy * 100:.2f}%')
-
-class DarkPatternDataset(Dataset):
-    def __init__(self, input_ids, attention_mask, labels):
-        self.input_ids = input_ids
-        self.attention_mask = attention_mask
-        self.labels = labels
-
-    def __len__(self):
-        return len(self.input_ids)
-
-    def __getitem__(self, idx):
-        return {'input_ids': self.input_ids[idx].squeeze(), 'attention_mask': self.attention_mask[idx].squeeze(), 'labels': self.labels[idx]}
diff --git a/api/mlApi/termsMlTrain.py b/api/mlApi/termsMlTrain.py
deleted file mode 100644
index 1e34223..0000000
--- a/api/mlApi/termsMlTrain.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import torch
-from torch.utils.data import DataLoader, Dataset
-from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score
-
-from termsMlModel import load_dataset, preprocess_data, train_model, evaluate_model
-
-# Task 1: Load your dataset
-dataset_path = 'path/to/your_dataset.csv' # Replace with the path to your dataset
-texts, labels = load_dataset(dataset_path)
-
-# Task 2: Configure model parameters
-model_name = 'bert-base-uncased'
-num_labels = 2 # Binary classification (dark pattern or not)
-batch_size = 8
-max_length = 512 # Adjust based on your dataset and model's input size
-
-# Task 3: Preprocess the data
-# Split the dataset
-texts_train, texts_val, labels_train, labels_val = train_test_split(texts, labels, test_size=0.1, random_state=42)
-
-# Preprocess the training data
-train_dataset = preprocess_data(texts_train, labels_train, max_length)
-train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
-
-# Preprocess the validation data
-val_dataset = preprocess_data(texts_val, labels_val, max_length)
-val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
-
-# Set up the BERT model
-model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
-
-# Train the model
-epochs = 3
-learning_rate = 2e-5
-train_model(model, train_dataloader, val_dataloader, epochs, learning_rate)
-
-# Evaluate the model
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-evaluate_model(model, val_dataloader, device)