From c2cdbc9e991b2b62925ff9d67025283ba7fd61ff Mon Sep 17 00:00:00 2001 From: nikita Date: Thu, 4 Apr 2024 10:03:41 +0600 Subject: [PATCH 1/5] add PIL_image_save method add read_EXIF function --- icrawler/storage/filesystem.py | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/icrawler/storage/filesystem.py b/icrawler/storage/filesystem.py index d375dd2..325619d 100644 --- a/icrawler/storage/filesystem.py +++ b/icrawler/storage/filesystem.py @@ -5,6 +5,26 @@ from .base import BaseStorage +from PIL import Image +from PIL.ExifTags import TAGS +import piexif + + +def read_EXIF(image_path): + try: + image = Image.open(image_path) + exif_data = image.getexif() + exif = '' + for tag_id, value in exif_data.items(): + tag_description = TAGS.get(tag_id, tag_id) + if isinstance(value, bytes): + value = value.decode(errors='replace') + exif += f"{tag_description:20}: {value}\n" + return exif + + except Exception as e: + return e + class FileSystem(BaseStorage): """Use filesystem as storage backend. @@ -12,6 +32,31 @@ class FileSystem(BaseStorage): The id is filename and data is stored as text files or binary files. """ + def PIL_image_save(self, id, data): + """ + For download images with metadata + image title - original image name + author - image url + """ + image = Image.open(BytesIO(data.content)) + exif_dict = image.info.get('exif') + if exif_dict: + exif_dict = piexif.load(exif_dict) + else: + exif_dict = {"0th": {}, "Exif": {}, "GPS": {}, "Interop": {}, "1st": {}, "thumbnail": None} + exif_dict['0th'][piexif.ImageIFD.Artist] = data.url.encode('utf-8') + exif_dict['0th'][piexif.ImageIFD.ImageDescription] = data.url.split('/')[-1].encode('utf-8') + exif_bytes = piexif.dump(exif_dict) + + filepath = osp.join(self.root_dir, id) + folder = osp.dirname(filepath) + if not osp.isdir(folder): + try: + os.makedirs(folder) + except OSError: + pass + image.save(filepath, exif=exif_bytes) + def __init__(self, root_dir): self.root_dir = root_dir From 4282075eb7efef96898ba5ab580e569dcb343728 Mon Sep 17 00:00:00 2001 From: nikita Date: Thu, 4 Apr 2024 10:16:48 +0600 Subject: [PATCH 2/5] save images with metadata --- icrawler/downloader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/icrawler/downloader.py b/icrawler/downloader.py index db536f3..8d84b9b 100644 --- a/icrawler/downloader.py +++ b/icrawler/downloader.py @@ -136,7 +136,10 @@ def download(self, task, default_ext, timeout=5, max_retry=3, overwrite=False, * self.fetched_num += 1 filename = self.get_filename(task, default_ext) self.logger.info("image #%s\t%s", self.fetched_num, file_url) - self.storage.write(filename, response.content) + + # self.storage.write(filename, response.content) + self.storage.PIL_image_save(filename, response) + task["success"] = True task["filename"] = filename break From 15905dfc8b187ed45776f42d7f2de71aa5bc53c0 Mon Sep 17 00:00:00 2001 From: nikita Date: Thu, 4 Apr 2024 10:17:22 +0600 Subject: [PATCH 3/5] change parse function --- icrawler/builtin/google.py | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/icrawler/builtin/google.py b/icrawler/builtin/google.py index f663c6d..29df646 100644 --- a/icrawler/builtin/google.py +++ b/icrawler/builtin/google.py @@ -141,26 +141,11 @@ def feed(self, keyword, offset, max_num, language=None, filters=None): self.logger.debug(f"put url to url_queue: {url}") -class GoogleParser(Parser): def parse(self, response): soup = BeautifulSoup(response.content.decode("utf-8", "ignore"), "lxml") - # image_divs = soup.find_all('script') - image_divs = soup.find_all(name="script") - for div in image_divs: - # txt = div.text - txt = str(div) - # if not txt.startswith('AF_initDataCallback'): - if "AF_initDataCallback" not in txt: - continue - if "ds:0" in txt or "ds:1" not in txt: - continue - # txt = re.sub(r"^AF_initDataCallback\({.*key: 'ds:(\d)'.+data:function\(\){return (.+)}}\);?$", - # "\\2", txt, 0, re.DOTALL) - # meta = json.loads(txt) - # data = meta[31][0][12][2] - # uris = [img[1][3][0] for img in data if img[0] == 1] - - uris = re.findall(r"http[^\[]*?\.(?:jpg|png|bmp)", txt) + uris = [uri for div in soup.find_all(name="script") for uri in re.findall(r"http[^\[]*?\.(?:jpg|png|bmp)", str(div))] + uris = [bytes(uri, 'utf-8').decode('unicode-escape') for uri in uris] + if uris: return [{"file_url": uri} for uri in uris] From 9b773343ce0ee6820977389bd340438038fd6afa Mon Sep 17 00:00:00 2001 From: nikita Date: Fri, 26 Jul 2024 12:10:04 +0600 Subject: [PATCH 4/5] change PIL_image_save function --- icrawler/storage/filesystem.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/icrawler/storage/filesystem.py b/icrawler/storage/filesystem.py index 325619d..54f25b8 100644 --- a/icrawler/storage/filesystem.py +++ b/icrawler/storage/filesystem.py @@ -1,3 +1,4 @@ +import logging import os import os.path as osp @@ -5,6 +6,7 @@ from .base import BaseStorage +from io import BytesIO from PIL import Image from PIL.ExifTags import TAGS import piexif @@ -32,11 +34,14 @@ class FileSystem(BaseStorage): The id is filename and data is stored as text files or binary files. """ + def __init__(self, root_dir): + self.root_dir = root_dir + def PIL_image_save(self, id, data): """ For download images with metadata - image title - original image name - author - image url + Image title (ImageDescription) = original image filename + Author (Artist) = image url """ image = Image.open(BytesIO(data.content)) exif_dict = image.info.get('exif') @@ -56,9 +61,7 @@ def PIL_image_save(self, id, data): except OSError: pass image.save(filepath, exif=exif_bytes) - - def __init__(self, root_dir): - self.root_dir = root_dir + print(f"{read_EXIF(filepath)}") def write(self, id, data): filepath = osp.join(self.root_dir, id) From 393f843c2a103f076138ba54b706dbff548779d7 Mon Sep 17 00:00:00 2001 From: Zhiyuan Chen Date: Mon, 29 Jul 2024 20:58:25 +0800 Subject: [PATCH 5/5] fix Parser class does not accept kwargs, #128 Signed-off-by: Zhiyuan Chen --- icrawler/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/icrawler/parser.py b/icrawler/parser.py index e185631..a3b2a96 100644 --- a/icrawler/parser.py +++ b/icrawler/parser.py @@ -22,9 +22,9 @@ class Parser(ThreadPool): lock: A threading.Lock object. """ - def __init__(self, thread_num, signal, session): + def __init__(self, thread_num, signal, session, in_queue=None, out_queue=None, name="parser"): """Init Parser with some shared variables.""" - super().__init__(thread_num, name="parser") + super().__init__(thread_num, in_queue=in_queue, out_queue=out_queue, name=name) self.signal = signal self.session = session