-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_InstaCaptions.py
70 lines (52 loc) · 2.08 KB
/
scrape_InstaCaptions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import instaloader
import logging
import sys
import subprocess
import requests
import re
sys.path.append(subprocess.check_output('git rev-parse --show-toplevel'.split()).decode('utf-8').strip())
from scrapers.base.base_scraper import BaseScraper
from PIL import Image
from io import BytesIO
import pytesseract
def clean_text(text, max_words=12):
    """Normalize raw (OCR) text into a short single-line string.

    Steps, in order:
      1. Drop code points that cannot be encoded as UTF-8.
      2. Treat every run of non-word characters (punctuation, newlines, ...)
         as a word separator.
      3. Keep only the first ``max_words`` words.
      4. Remove words written entirely in capital letters.

    Args:
        text: The raw input string.
        max_words: How many leading words are kept before all-caps words are
            removed. Defaults to 12.

    Returns:
        The surviving words joined by single spaces, without leading or
        trailing whitespace. May be the empty string.
    """
    # Round-trip through UTF-8 so unencodable code points (e.g. lone
    # surrogates) are silently discarded.
    text = text.encode("utf-8", "ignore").decode("utf-8")
    # After this substitution the words consist of word characters only, so a
    # plain split() yields the word list (newlines are covered by \W as well).
    words = re.sub(r'\W+', ' ', text).split()
    # Filter whole words instead of blanking them out inside the joined string:
    # that avoids leftover double/leading spaces, and str.isupper() also
    # recognises non-ASCII capitals such as "ÜBER", which [A-Z] would miss.
    kept = [
        word for word in words[:max_words]
        if not (word.isalpha() and word.isupper())
    ]
    return ' '.join(kept)
def text_from_image(url, timeout=30):
    """Download an image, run German OCR on it and return the cleaned text.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection. Defaults to 30.

    Returns:
        The OCR result after it has been passed through ``clean_text``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the decoder.
    response.raise_for_status()
    # The context manager releases the image's resources once OCR is done.
    with Image.open(BytesIO(response.content)) as image:
        text = pytesseract.image_to_string(image, lang='deu')
    return clean_text(text)
def base_metadata_dict(post: instaloader.Post) -> dict:
    """Collect the metadata stored alongside a scraped post.

    Args:
        post: The Instagram post to describe.

    Returns:
        A dict with the keys ``title`` (text recognised in the post image),
        ``description`` (the caption), ``url`` (the post's ``url`` attribute),
        ``date`` (UTC date formatted as ``YYYY-MM-DD``) and ``kicker`` (the
        hashtags found in the caption).
    """
    # The title is not part of the post data; it is read from the picture.
    title = text_from_image(post.url)
    description = post.caption
    url = post.url
    date = post.date_utc.strftime("%Y-%m-%d")
    kicker = post.caption_hashtags
    return {
        "title": title,
        "description": description,
        "url": url,
        "date": date,
        "kicker": kicker,
    }
class InstaScraper(BaseScraper):
    """Scraper that saves the captions of an Instagram profile's posts."""

    def __init__(self):
        """Set up the instaloader session and load the profile to scrape."""
        # NOTE(review): "dlf" is presumably the source identifier expected by
        # BaseScraper -- confirm against the base class.
        super().__init__("dlf")
        self.L = instaloader.Instaloader()
        self.username = "nachrichtenleicht"
        self.profile = instaloader.Profile.from_username(self.L.context, self.username)

    def scrape(self):
        """Save every post of the profile that has not been stored yet.

        Posts are skipped when they are already saved, when they have no
        caption, or when the caption starts with "Unser". For all other posts
        the caption is saved as the article content under the "easy" level.
        """
        # NOTE(review): self.data_handler is not set in this class; it is
        # assumed to be provided by BaseScraper.
        for post in self.profile.get_posts():
            if self.data_handler.is_already_saved("easy", post.url):
                continue
            caption = post.caption
            # instaloader returns None for posts without a caption; slicing it
            # would raise TypeError and abort the whole run, and there would be
            # no text to store anyway.
            if caption is None:
                logging.info("Skipping post without caption: %s", post.url)
                continue
            if caption.startswith("Unser"):
                continue
            metadata = base_metadata_dict(post)
            self.data_handler.save_article("easy", metadata, caption, post.url, download_audio=False)
if __name__ == '__main__':
    # Script entry point: enable INFO-level logging with timestamps, then run
    # one full scrape of the profile.
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    scraper = InstaScraper()
    scraper.scrape()