-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_text.py
33 lines (24 loc) · 1.05 KB
/
clean_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from pdfminer.high_level import extract_text
import re
# Define file paths
pdf_path = "data/Manly-P-Hall_The-Secret-Teachings-of-All-Ages.pdf"
output_txt_path = "data/secret_teachings_cleaned_pdfminer.txt"
def clean_text(text):
# Remove unwanted characters like newlines, extra spaces, and page numbers
text = re.sub(r"\s+", " ", text) # Replace multiple whitespace with a single space
text = re.sub(r"([0-9]+)", "", text) # Remove page numbers
text = re.sub(r"[^A-Za-z0-9,.?!:;\-\(\)\'\s]", "", text) # Keep basic punctuation
return text.strip()
def extract_text_from_pdf_with_pdfminer(pdf_path):
try:
text = extract_text(pdf_path)
return clean_text(text)
except Exception as e:
print(f"Failed to extract text: {e}")
return ""
# Extract the text using PDFMiner
text_data = extract_text_from_pdf_with_pdfminer(pdf_path)
# Save the cleaned text into a .txt file
with open(output_txt_path, "w", encoding="utf-8") as f:
f.write(text_data)
print(f"Text extracted and cleaned. Saved to {output_txt_path}.")