summary.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
from transformers import BartForConditionalGeneration, BartTokenizer

def get_article_text(url):
    """Fetch a web page and return its <title> text and the concatenated <p> text."""
    response = requests.get(url)
    # Honour the charset declared in the Content-Type header; otherwise fall back to UTF-8.
    encoding = response.encoding if 'charset' in response.headers.get('content-type', '').lower() else 'utf-8'
    decoded_content = response.content.decode(encoding)
    soup = BeautifulSoup(decoded_content, 'html.parser')
    title = soup.find('title').text
    paragraphs = soup.find_all('p')
    article_text = '\n'.join([p.get_text() for p in paragraphs])
    return title, article_text

# Legacy version, kept for reference (relied on transformers.pipeline("summarization")):
# def summarize_text(text, max_length=5000):
#     summarizer = pipeline("summarization")
#     summary = summarizer(text, max_length=max_length, do_sample=False, clean_up_tokenization_spaces=True)
#     return summary[0]['summary_text']

def summarize_text_pt(text, max_length=512):
    """Summarize Portuguese text with mBART-50, forcing Portuguese as the output language."""
    model_name = "facebook/mbart-large-50"
    tokenizer = MBart50Tokenizer.from_pretrained(model_name)
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    # Tokenize and truncate the article to the model's 1024-token input limit.
    input_ids = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=1024).input_ids
    # Start decoding with the Portuguese language code so the summary comes out in Portuguese.
    model.config.decoder_start_token_id = tokenizer.lang_code_to_id["pt_XX"]
    summary_ids = model.generate(input_ids, num_beams=4, length_penalty=2.0, max_length=max_length, min_length=50)
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary_text

def summarize_text(text, max_length=512):
    """Summarize English text with BART fine-tuned on CNN/DailyMail."""
    model_name = "facebook/bart-large-cnn"
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
    # Tokenize and truncate the article to the model's 1024-token input limit.
    input_ids = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=1024).input_ids
    summary_ids = model.generate(input_ids, num_beams=4, length_penalty=2.0, max_length=max_length, min_length=50)
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary_text

def summarize_article(url, language="en"):
    """Fetch the article at `url` and summarize it in the requested language ("en" or "pt")."""
    title, article_text = get_article_text(url)
    if language == "en":
        summarized_text = summarize_text(article_text)
    elif language == "pt":
        summarized_text = summarize_text_pt(article_text)
    else:
        raise ValueError(f"Unsupported language: {language}")
    return title, summarized_text

def write_summarized_to_file(summarized_text, url):
    # Parse the URL and extract the path.
    parsed_url = urlparse(url)
    path = parsed_url.path
    # Remove the leading/trailing slashes and split the path by slashes.
    path_parts = path.strip('/').split('/')
    # Get the last part of the path and decode any URL-encoded characters.
    last_part = unquote(path_parts[-1])
    # Create the filename with a .txt extension.
    filename = f"{last_part}.txt"
    # Write the summarized text to the file (UTF-8 so accented characters survive).
    with open(filename, "w", encoding="utf-8") as file:
        file.write(summarized_text)
    print(f"Summarized text saved to {filename}")
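

# Minimal usage sketch, assuming the script is meant to be run directly:
# fetch an article, summarize it in English, and save the summary as a .txt
# file named after the last path segment of the URL. The URL below is a
# placeholder, not a real target.
if __name__ == "__main__":
    example_url = "https://example.com/some-article"  # placeholder URL; replace with a real article
    article_title, summary = summarize_article(example_url, language="en")
    print(f"Title: {article_title}")
    write_summarized_to_file(summary, example_url)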