-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
87 lines (57 loc) · 2.03 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import pathlib
import sys
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup

from data_extract import extractor
from pdftotext import convert_pdf_to_txt
def get_html(url):
    """Return the body of the page at *url* decoded as UTF-8 text.

    Args:
        url: Address to fetch; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when read() or decode()
    # raises, which the previous explicit close() call did not guarantee.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf8")
def download_file(download_url, save_path):
    """Download a binary file (e.g. a PDF) from *download_url* to *save_path*.

    The download is best-effort: a failure is reported on stderr and the
    function returns normally, so one bad link does not abort a batch of
    downloads running in a thread pool.

    Args:
        download_url: URL of the file to fetch.
        save_path: Local path the downloaded bytes are written to.

    Returns:
        None.
    """
    try:
        urllib.request.urlretrieve(download_url, save_path)
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate, and report the
        # failure instead of hiding it.
        print(
            "Failed to download {}: {}".format(download_url, exc),
            file=sys.stderr,
        )
def convert_file(source, destination):
    """Convert the PDF at *source* to text and write it to *destination*.

    Args:
        source: Path of the PDF file, handed to ``convert_pdf_to_txt``.
        destination: Path of the text file to create or overwrite.

    Returns:
        None.
    """
    # convert_pdf_to_txt is imported from the pdftotext module; presumably it
    # returns the extracted text as a str, since the result is passed straight
    # to write() -- TODO confirm.
    raw_text = convert_pdf_to_txt(source)
    # The context manager closes the file even if write() raises. The file is
    # opened with the platform default encoding, as before.
    with open(destination, "w") as text_file:
        text_file.write(raw_text)
# Script body: scrape the result page for attachment links, download each
# linked PDF, convert it to text and run the extractor on the text.

# Host that relative attachment links are resolved against.
BASE_URL = "https://www.ktu.edu.in"
# Working directories for the downloaded PDFs and the converted text files.
PDF_DIR = "./pdf"
TEXT_DIR = "./text"

download_links = []  # list to store the download urls
no_of_colleges = 0  # not referenced by the code visible here; kept as-is

result_url = input("Enter the url of result page: ")
# format should be https://www.example.com
html_doc = get_html(result_url)

# arrange html string using html parser
soup = BeautifulSoup(html_doc, 'html.parser')

# href=True skips anchors that have no href attribute; for those,
# link.get('href') returns None and the substring test would raise TypeError.
for link in soup.find_all('a', href=True):
    link_url = link.get('href')
    if "attachment" in link_url:
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against BASE_URL, instead of blindly prefixing the host string.
        download_links.append(urljoin(BASE_URL, link_url))

print("Started downloading files...")
print("Please wait!")
print("Download links is {}".format(len(download_links)))

# Create the working directories up front so downloads and conversions do
# not fail just because a folder is missing.
for directory in (PDF_DIR, TEXT_DIR):
    pathlib.Path(directory).mkdir(parents=True, exist_ok=True)

# Files are numbered from 1 in the order the links were found.
destination = [
    "{}/{}.pdf".format(PDF_DIR, i) for i in range(1, len(download_links) + 1)
]

results = []
if download_links:
    # Use at most 100 worker threads, and never more than there are links.
    # The with-block releases the pool once starmap has returned.
    with ThreadPool(min(100, len(download_links))) as pool:
        results = pool.starmap(download_file, zip(download_links, destination))

print("Started conversion of files...")
print("Please wait!")
for i, pdf_path in enumerate(destination, start=1):
    if not pathlib.Path(pdf_path).is_file():
        # download_file is best-effort, so a PDF can be missing; skip it
        # rather than abort the remaining conversions.
        print("Skipping {}: {} was not downloaded".format(i, pdf_path))
        continue
    text_path = "{}/{}.txt".format(TEXT_DIR, i)
    convert_file(pdf_path, text_path)
    extractor(text_path)
    print("Converted {}-".format(i))
print("Conversion completed and data extracted")