Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Commit

Permalink
Determine cover image file extension from response header
Browse files Browse the repository at this point in the history
Fixes #29

Add User-Agent to fix 403 Client Error

When no User-Agent is provided, Wikipedia returns a 403 status code:

requests.exceptions.HTTPError: 403 Client Error: Forbidden. Please comply with the User-Agent policy: https://meta.wikimedia.org/wiki/User-Agent_policy for url: https://upload.wikimedia.org/wikipedia/commons/3/3f/JPEG_example_flower.jpg
  • Loading branch information
ptrstn committed Mar 13, 2022
1 parent a21ccd9 commit dc39500
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 11 deletions.
27 changes: 18 additions & 9 deletions dailyblink/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import cloudscraper
from bs4 import BeautifulSoup
from cloudscraper import CloudflareChallengeError
from requests import Response

from dailyblink.media import (
set_m4a_meta_data,
Expand All @@ -15,7 +16,7 @@
)
from dailyblink.settings import (
BASE_URL,
COVER_FILE_NAME,
COVER_FILE_NAME_STEM,
PLAYLIST_FILE_NAME,
LANGUAGES,
MAX_CLOUDFLARE_ATTEMPTS,
Expand Down Expand Up @@ -75,22 +76,23 @@ def _download_daily_blinks(self, language_code, base_path):
valid_author = re.sub(r"([^\s\w]|_)+", "", blink_info["author"])
book_path = base_path / language / f"{date.today()} - {valid_title}"

print("Saving book text...")
markdown_text = _create_markdown_text(blink_info, chapters)
markdown_book_path = book_path / f"{valid_title} - {valid_author}.md"
save_text(text=markdown_text, file_path=markdown_book_path)

print("Saving book cover...")
cover_response = self.scraper.get(blink_info["cover_url"])
file_extension = _determine_file_extension(cover_response)
cover = cover_response.content
cover_path = book_path / COVER_FILE_NAME
cover_path = book_path / f"{COVER_FILE_NAME_STEM}.{file_extension}"
save_media(media=cover, file_path=cover_path)

print("Saving book text...")
markdown_text = _create_markdown_text(blink_info, chapters, cover_path)
markdown_book_path = book_path / f"{valid_title} - {valid_author}.md"
save_text(text=markdown_text, file_path=markdown_book_path)

try:
file_list = []
for number, chapter_id in enumerate(chapter_ids):
status = f"Saving audio track #{number + 1} - {chapters[number][0]}"
status_limited = f"{status[:MAX_LINE_LENGTH-3]}..."
status_limited = f"{status[:MAX_LINE_LENGTH - 3]}..."
print(status_limited)
file_name = f"{number:02d} - {valid_title}.m4a"
file_path = book_path / file_name
Expand Down Expand Up @@ -191,7 +193,7 @@ def _create_blink_info(response_text):
}


def _create_markdown_text(blink_info, chapters, cover_path=COVER_FILE_NAME):
def _create_markdown_text(blink_info, chapters, cover_path):
markdown_text = f"# {blink_info['title']}\n\n"
markdown_text += f"_{blink_info['author']}_\n\n"
markdown_text += f"{blink_info['read_time']}\n\n"
Expand All @@ -210,3 +212,10 @@ def _create_markdown_text(blink_info, chapters, cover_path=COVER_FILE_NAME):

markdown_text += f"Source: {blink_info['url']}\n\n"
return markdown_text


def _determine_file_extension(cover_response: "Response") -> str:
    """Derive a file extension from a response's ``Content-Type`` header.

    The media subtype is used as the extension, so ``image/jpeg`` yields
    ``"jpeg"`` and ``image/png`` yields ``"png"``. Media-type parameters
    (for example ``; charset=binary``) are ignored, and the result is
    stripped of surrounding whitespace and lower-cased.

    Args:
        cover_response: Response object whose ``headers`` mapping is looked
            up with the key ``"content-type"``.

    Returns:
        The normalised media subtype, without a leading dot.

    Raises:
        KeyError: If ``headers`` has no ``"content-type"`` entry.
        ValueError: If the header value is not of the form ``type/subtype``.
    """
    content_type = cover_response.headers["content-type"]

    # Parameters follow the first ";" and may themselves contain "/", so
    # they have to be removed before the type/subtype split.
    media_type = content_type.partition(";")[0].strip()

    parts = media_type.split("/")
    if len(parts) != 2 or not parts[1].strip():
        raise ValueError(
            f"Cannot determine file extension from content type {content_type!r}"
        )

    # Media types are case-insensitive; normalise so file names are stable.
    return parts[1].strip().lower()
2 changes: 1 addition & 1 deletion dailyblink/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

BASE_URL = "https://www.blinkist.com"

COVER_FILE_NAME = "cover.jpg"
COVER_FILE_NAME_STEM = "cover"
PLAYLIST_FILE_NAME = "playlist.m3u"
BLINKS_DEFAULT_PATH = pathlib.Path.home() / "Musik"
BLINKS_DIR_NAME = "blinks"
Expand Down
33 changes: 32 additions & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import pathlib

import requests

from dailyblink.core import (
save_media,
BlinkistScraper,
_create_markdown_text,
_determine_file_extension,
)
from dailyblink.media import save_text

Expand Down Expand Up @@ -73,5 +76,33 @@ def test_save_book_text():
blink_info = blinkist_scraper._get_daily_blink_info(language="de")
blink_url = blink_info["url"]
chapters = blinkist_scraper._request_blinkist_book_text(blink_url)["chapters"]
markdown_text = _create_markdown_text(blink_info, chapters)
markdown_text = _create_markdown_text(
blink_info, chapters, "test_output/cover.jpeg"
)
save_text(markdown_text, file_path="test_output/daily_blink.md")


def test_determine_file_extension():
    """Check extension detection against real image responses.

    NOTE: this test performs live HTTP requests, so it needs network access
    and depends on the remote servers continuing to serve these files.
    """
    # A User-Agent is sent because Wikimedia answers requests without one
    # with "403 Client Error: Forbidden".
    headers = {
        "user-agent": (
            "Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/98.0.4758.80 Safari/537.36"
        ),
    }

    expected_extension_by_url = {
        "https://upload.wikimedia.org/wikipedia/commons/3/3f/JPEG_example_flower.jpg": "jpeg",
        "https://upload.wikimedia.org/wikipedia/de/b/bb/Png-logo.png": "png",
        # The URL ends in ".jpg", but the extension derived from the
        # response header is expected to be "png".
        "https://images.blinkist.io/images/books/5e48b5726cee070006209356/1_1/470.jpg": "png",
    }

    for url, expected_extension in expected_extension_by_url.items():
        # A timeout keeps an unresponsive server from hanging the test run.
        response = requests.get(url, headers=headers, timeout=30)
        # Fail with the HTTP error itself rather than a misleading
        # extension mismatch taken from an error page.
        response.raise_for_status()
        assert _determine_file_extension(response) == expected_extension, url

0 comments on commit dc39500

Please sign in to comment.