mcq_generator.py
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
import pke
from textwrap3 import wrap
import random
import numpy as np
import nltk
nltk_data_dir = r"./" # Current directory
nltk.data.path.append(nltk_data_dir)
import nltk_downloader  # One-time download of the required NLTK data files; comment this out once they are present
from sklearn.metrics.pairwise import cosine_similarity
import torch
from sense2vec import Sense2Vec
from similarity.normalized_levenshtein import NormalizedLevenshtein
from models_initiator import (
    summary_model,
    question_model,
    summary_tokenizer,
    question_tokenizer,
    sentence_transformer_model,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use the 2015 Reddit-trained sense2vec model ("s2v_old"); it is shown to give better results than the 2019 one.
s2v = Sense2Vec().from_disk('s2v_old')

def set_seed(seed: int):
    """Seed all random number generators (Python, NumPy, PyTorch) for reproducible output."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def postprocesstext(content):
    """
    Tokenize the given text into sentences, capitalize each sentence with
    str.capitalize() (which also lowercases the remaining characters), and
    join the sentences back into a single string.
    """
    final = ""
    for sent in sent_tokenize(content):
        sent = sent.capitalize()
        final = final + " " + sent
    return final.strip()
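
# Example (illustrative input): postprocesstext("hello world. how are you?")
# returns "Hello world. How are you?".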

def summarizer(text, model, tokenizer):
    """
    Summarize the given text with the provided seq2seq model and tokenizer,
    returning the highest-scoring beam as a cleaned-up summary string.
    """
    text = text.strip().replace("\n", " ")
    text = "summarize: " + text
    # print (text)
    # Cap the encoded input at roughly half the text's word count.
    max_len = int(len(text.split()) * 0.5)
    encoding = tokenizer.encode_plus(
        text,
        max_length=max_len,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    outs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        early_stopping=True,
        num_beams=8,
        num_return_sequences=5,
        no_repeat_ngram_size=4,
        min_length=75,
        max_length=300,
    )
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    summary = postprocesstext(dec[0])
    return summary.strip()
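
# Usage sketch (summary_model / summary_tokenizer come from models_initiator,
# imported above):
# summary = summarizer(long_text, summary_model, summary_tokenizer)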

def get_nouns_multipartite(content):
    """
    Extract keyphrases built around nouns (and related parts of speech) from
    the given text using MultipartiteRank, for use as answers and
    context-based distractors.
    """
    out = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content, language="en")
        pos = {"PROPN", "NOUN", "ADJ", "VERB", "ADP", "ADV", "DET", "CONJ", "NUM", "PRON", "X"}
        # Punctuation, bracket tokens, and stopwords to exclude from candidates.
        # NOTE: this stoplist is built but never passed to pke; where it should
        # go depends on the installed pke version (newer releases accept it in
        # load_document, older ones in candidate_selection).
        stoplist = list(string.punctuation)
        stoplist += ["-lrb-", "-rrb-", "-lcb-", "-rcb-", "-lsb-", "-rsb-"]
        stoplist += stopwords.words("english")
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(alpha=1.1, threshold=0.75, method="average")
        # Increase n to 50 or more to generate more MCQs.
        keyphrases = extractor.get_n_best(n=50)
        for val in keyphrases:
            out.append(val[0])
    except Exception as e:
        print(e)
        out = []
    return out
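
# Example (illustrative): for a paragraph about the solar system, this might
# return ranked phrases such as ["solar system", "planets", "sun"].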

def get_keywords(originaltext):
    """
    Generate keywords from the original text. The commented-out code below
    additionally filtered them against the keywords found in the summary,
    keeping only the ones present in both; currently all extracted keywords
    are returned.
    """
    keywords = get_nouns_multipartite(originaltext)
    # print ("keywords unsummarized: ", keywords)
    # keyword_processor = KeywordProcessor()
    # for keyword in keywords:
    #     keyword_processor.add_keyword(keyword)
    # keywords_found = keyword_processor.extract_keywords(summarytext)
    # keywords_found = list(set(keywords_found))
    # print ("keywords_found in summarized: ", keywords_found)
    # important_keywords = []
    # for keyword in keywords:
    #     if keyword in keywords_found:
    #         important_keywords.append(keyword)
    # return important_keywords
    return keywords

def get_question(context, answer, model, tokenizer):
    """
    Generate a question from the given context whose expected answer is the
    supplied keyword, using the pretrained question-generation model and
    tokenizer.
    """
    text = "context: {} answer: {}".format(context, answer)
    encoding = tokenizer.encode_plus(
        text,
        max_length=384,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    outs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        early_stopping=True,
        num_beams=8,
        num_return_sequences=5,
        no_repeat_ngram_size=4,
        max_length=72,
    )
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    question = dec[0].replace("question:", "")
    return question.strip()
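
# Usage sketch ("photosynthesis" is an illustrative keyword, not taken from
# this repo):
# ques = get_question(summary_text, "photosynthesis", question_model, question_tokenizer)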

def filter_same_sense_words(original, wordlist):
    """
    Keep only the candidates that share the sense tag of the original
    sense2vec key. Each wordlist entry is a (word|SENSE, score) tuple, so the
    sense is recovered by splitting on "|".
    """
    filtered_words = []
    base_sense = original.split("|")[1]
    # print (base_sense)
    for eachword in wordlist:
        if eachword[0].split("|")[1] == base_sense:
            filtered_words.append(
                eachword[0].split("|")[0].replace("_", " ").title().strip()
            )
    return filtered_words
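
# Example: filter_same_sense_words("cricket|NOUN", [("football|NOUN", 0.9),
# ("England|GPE", 0.8)]) returns ["Football"]; only the first candidate
# shares the NOUN sense tag.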

def get_highest_similarity_score(wordlist, wrd):
    """
    Return the highest normalized Levenshtein similarity between wrd and any
    word in wordlist. Callers use this to reject candidates that are too
    close in spelling to options already chosen, since the options should
    differ from one another while relating to the same context.
    """
    score = []
    normalized_levenshtein = NormalizedLevenshtein()
    for each in wordlist:
        score.append(normalized_levenshtein.similarity(each.lower(), wrd.lower()))
    return max(score)
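
# Example: get_highest_similarity_score(["cat"], "cats") == 0.75; one edit
# over a maximum length of four gives a normalized distance of 0.25.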

def sense2vec_get_words(word, s2v, topn, question):
    """
    Find the best sense of the given word in the sense2vec model, fetch the
    topn most similar entries that share that sense, and return those that
    are sufficiently different in spelling from the word (and from each
    other) and that do not already appear in the question.
    """
    output = []
    # print ("word ", word)
    try:
        sense = s2v.get_best_sense(
            word,
            senses=[
                "NOUN",
                "PERSON",
                "PRODUCT",
                "LOC",
                "ORG",
                "EVENT",
                "NORP",
                "WORK OF ART",
                "FAC",
                "GPE",
                "NUM",
                "FACILITY",
            ],
        )
        most_similar = s2v.most_similar(sense, n=topn)
        # print (most_similar)
        output = filter_same_sense_words(sense, most_similar)
        # print ("Similar ", output)
    except Exception:
        # The word (or any of its senses) is not in the sense2vec vocabulary.
        output = []
    # Reject candidates whose spelling similarity to an already accepted
    # option exceeds this threshold.
    threshold = 0.6
    final = [word]
    checklist = question.split()
    for x in output:
        if (
            get_highest_similarity_score(final, x) < threshold
            and x not in final
            and x not in checklist
        ):
            final.append(x)
    return final[1:]
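
# Usage sketch: sense2vec_get_words("cricket", s2v, 20, "Which sport uses a
# bat and a ball?") would return same-sense neighbours of "cricket", e.g.
# other sports (illustrative; the output depends on the s2v_old vectors).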

def mmr(doc_embedding, word_embeddings, words, top_n, lambda_param):
    """
    Select top_n keywords/keyphrases with Maximal Marginal Relevance (MMR).
    MMR balances each candidate's relevance to the document against its
    redundancy with the keywords already selected, so the chosen keywords are
    both informative and distinct from each other.
    """
    # Extract similarity within words, and between words and the document.
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)
    # Initialize candidates and pick the most document-similar keyword first.
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases.
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(
            word_similarity[candidates_idx][:, keywords_idx], axis=1
        )
        # MMR score = lambda * relevance-to-document
        #             - (1 - lambda) * redundancy w.r.t. selected keywords
        mmr_scores = lambda_param * candidate_similarities - (
            1 - lambda_param
        ) * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]
        # Update keywords & candidates.
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)
    return [words[idx] for idx in keywords_idx]
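
# Worked toy example (illustrative embeddings): with doc = [[1.0, 0.0]] and
# word embeddings [[0.9, 0.1], [0.8, 0.2], [0.0, 1.0]] for ["alpha", "beta",
# "gamma"], mmr(doc, emb, words, top_n=2, lambda_param=0.7) picks "alpha"
# first (highest document similarity), then "beta" (its relevance outweighs
# its redundancy with "alpha" at this lambda); with lambda_param=0.3 the
# second pick flips to the more diverse "gamma".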

def get_distractors_wordnet(word):
    """
    Use WordNet to find the first noun synset for the input word, then
    generate distractors from the hyponyms of that synset's hypernym, i.e.
    words that share a parent concept with the input word.
    """
    distractors = []
    try:
        syn = wn.synsets(word, "n")[0]
        word = word.lower()
        orig_word = word
        if len(word.split()) > 1:
            word = word.replace(" ", "_")
        hypernym = syn.hypernyms()
        if len(hypernym) == 0:
            return distractors
        for item in hypernym[0].hyponyms():
            name = item.lemmas()[0].name()
            # print ("name ", name, " word ", orig_word)
            if name == orig_word:
                continue
            name = name.replace("_", " ")
            name = " ".join(w.capitalize() for w in name.split())
            if name and name not in distractors:
                distractors.append(name)
    except Exception:
        print("Wordnet distractors not found")
    return distractors
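
# Example: get_distractors_wordnet("cat") walks up to the hypernym of the
# first noun synset of "cat" and returns its other hyponyms (co-hyponyms of
# "cat") as distractors; the exact list depends on the installed WordNet data.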

def get_distractors(
    word, origsentence, sense2vecmodel, sentencemodel, top_n, lambdaval
):
    """
    Generate distractor answer choices for the given target word in the
    context of the provided sentence. Candidates come from sense2vec and are
    re-ranked with MMR over sentence-transformer embeddings for diversity;
    the target word itself is excluded from the result.
    """
    distractors = sense2vec_get_words(word, sense2vecmodel, top_n, origsentence)
    # print ("distractors ", distractors)
    if len(distractors) == 0:
        return distractors
    distractors_new = [word.capitalize()]
    distractors_new.extend(distractors)
    # print ("distractors_new .. ", distractors_new)
    embedding_sentence = origsentence + " " + word.capitalize()
    # embedding_sentence = word
    keyword_embedding = sentencemodel.encode([embedding_sentence])
    distractor_embeddings = sentencemodel.encode(distractors_new)
    # filtered_keywords = mmr(keyword_embedding, distractor_embeddings, distractors, 4, 0.7)
    max_keywords = min(len(distractors_new), 5)
    filtered_keywords = mmr(
        keyword_embedding,
        distractor_embeddings,
        distractors_new,
        max_keywords,
        lambdaval,
    )
    # Collect the re-ranked keywords, dropping the target word itself.
    final = []
    for wrd in filtered_keywords:
        if wrd.lower() != word.lower():
            final.append(wrd.capitalize())
    return final

def get_mca_questions(context: str):
    """
    Generate multiple-choice questions from the given context: summarize the
    context, extract important keywords, generate a question for each
    keyword, and build randomized answer options that include the correct
    answer.
    """
    summarized_text = summarizer(context, summary_model, summary_tokenizer)
    # imp_keywords = get_keywords(context, summarized_text)
    imp_keywords = get_keywords(context)
    output_list = []
    for answer in imp_keywords:
        output = ""
        ques = get_question(summarized_text, answer, question_model, question_tokenizer)
        distractors = get_distractors(
            answer.capitalize(), ques, s2v, sentence_transformer_model, 40, 0.2
        )
        output = output + ques + "\n"
        if len(distractors) == 0:
            # Fall back to the other extracted keywords, excluding the answer itself.
            distractors = [k for k in imp_keywords if k.lower() != answer.lower()]
        if len(distractors) > 0:
            options = distractors[:4]
            # Cap the answer's slot so the correct answer is always inserted,
            # even when fewer than four distractors are available.
            random_integer = random.randint(0, min(3, len(options) - 1))
            alpha_list = ["(a)", "(b)", "(c)", "(d)"]
            for d, distractor in enumerate(options):
                if d == random_integer:
                    output = output + alpha_list[d] + " " + answer + "\n"
                else:
                    output = output + alpha_list[d] + " " + distractor + "\n"
            output = (
                output + "Correct answer is : " + alpha_list[random_integer] + "\n\n"
            )
        output_list.append(output)
    return output_list
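
# Minimal end-to-end sketch; the sample paragraph below is illustrative only
# and not part of this repo.
if __name__ == "__main__":
    set_seed(42)
    sample_context = (
        "Photosynthesis is the process by which green plants convert "
        "sunlight, water, and carbon dioxide into glucose and oxygen. "
        "It takes place mainly in the chloroplasts of plant cells."
    )
    for mcq in get_mca_questions(sample_context):
        print(mcq)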