Skip to content

Commit

Permalink
Merge branch 'OxFF00FF-feature_images_metadata'
Browse files Browse the repository at this point in the history
  • Loading branch information
ZhiyuanChen committed Jul 29, 2024
2 parents 4273064 + 393f843 commit f177cdc
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 24 deletions.
27 changes: 4 additions & 23 deletions icrawler/builtin/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,31 +141,12 @@ def feed(self, keyword, offset, max_num, language=None, filters=None):
self.logger.debug(f"put url to url_queue: {url}")


# Parser that pulls image URLs out of the <script> tags of a response page.
class GoogleParser(Parser):
# Returns a list of {"file_url": <url>} dicts. When no URL matches, no return
# statement is reached, so the result is implicitly None.
def parse(self, response):
# Decode the body as UTF-8 (undecodable bytes are dropped) and parse with lxml.
soup = BeautifulSoup(response.content.decode("utf-8", "ignore"), "lxml")
# image_divs = soup.find_all('script')
image_divs = soup.find_all(name="script")
for div in image_divs:
# txt = div.text
# Search the whole serialized tag, not only its text content.
txt = str(div)
# if not txt.startswith('AF_initDataCallback'):
# if "AF_initDataCallback" not in txt:
# continue
# if "ds:0" in txt or "ds:1" not in txt:
# continue
# txt = re.sub(r"^AF_initDataCallback\({.*key: 'ds:(\d)'.+data:function\(\){return (.+)}}\);?$",
# "\\2", txt, 0, re.DOTALL)
# meta = json.loads(txt)
# data = meta[31][0][12][2]
# uris = [img[1][3][0] for img in data if img[0] == 1]

# NOTE(review): the "." before the extension group is not escaped here, so it
# matches any character rather than a literal dot.
uris = re.findall(r"http[^\[]*?.(?:jpg|png|bmp)", txt)
# NOTE(review): anything the escaped pattern below matches is also matched by
# the unescaped pattern above, so this fallback looks unreachable in practice.
if not uris:
uris = re.findall(r"http[^\[]*?\.(?:jpg|png|bmp)", txt)
# Interpret backslash escape sequences embedded in the matched text.
uris = [bytes(uri, "utf-8").decode("unicode-escape") for uri in uris]
if uris:
return [{"file_url": uri} for uri in uris]
# NOTE(review): this page is a diff rendered without +/- markers or indentation.
# The loop above and the comprehension below appear to be two versions of the
# same extraction (first matching script tag vs. all script tags combined);
# confirm against the repository which one is current.
uris = [uri for div in soup.find_all(name="script") for uri in re.findall(r"http[^\[]*?\.(?:jpg|png|bmp)", str(div))]
uris = [bytes(uri, 'utf-8').decode('unicode-escape') for uri in uris]
if uris:
return [{"file_url": uri} for uri in uris]


class GoogleImageCrawler(Crawler):
Expand Down
5 changes: 4 additions & 1 deletion icrawler/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,10 @@ def download(self, task, default_ext, timeout=5, max_retry=3, overwrite=False, *
self.fetched_num += 1
filename = self.get_filename(task, default_ext)
self.logger.info("image #%s\t%s", self.fetched_num, file_url)
self.storage.write(filename, response.content)

# self.storage.write(filename, response.content)
self.storage.PIL_image_save(filename, response)

task["success"] = True
task["filename"] = filename
break
Expand Down
48 changes: 48 additions & 0 deletions icrawler/storage/filesystem.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,32 @@
import logging
import os
import os.path as osp

import six

from .base import BaseStorage

from io import BytesIO
from PIL import Image
from PIL.ExifTags import TAGS
import piexif


def read_EXIF(image_path):
    """Return the EXIF tags of an image as human-readable text.

    Every tag becomes one ``name: value`` line, with the name padded to a
    width of 20 characters. Numeric tag ids are translated through
    ``PIL.ExifTags.TAGS``; ids without a known name are shown as the raw id.
    ``bytes`` values are decoded, with undecodable bytes replaced.

    Args:
        image_path: Location of the image, passed straight to ``Image.open``.

    Returns:
        The formatted text (an empty string when the image carries no EXIF
        tags). On any failure the caught exception object is returned
        instead of being raised, so callers that need to tell success from
        failure must check the type of the result.
    """
    try:
        # Image.open leaves the underlying file open; the context manager
        # releases it as soon as the tags have been read.
        with Image.open(image_path) as image:
            exif_data = image.getexif()
            lines = []
            for tag_id, value in exif_data.items():
                tag_description = TAGS.get(tag_id, tag_id)
                if isinstance(value, bytes):
                    value = value.decode(errors="replace")
                lines.append(f"{tag_description:20}: {value}\n")
        return "".join(lines)
    except Exception as e:
        # Deliberately best-effort: this helper is used for diagnostics, so
        # the error is handed back to the caller rather than propagated.
        return e


class FileSystem(BaseStorage):
"""Use filesystem as storage backend.
Expand All @@ -15,6 +37,32 @@ class FileSystem(BaseStorage):
# Remember the directory that file ids are joined onto when building paths.
def __init__(self, root_dir):
self.root_dir = root_dir

def PIL_image_save(self, id, data):
    """Save a downloaded image with EXIF metadata describing its origin.

    The image is decoded from ``data.content`` and re-saved through Pillow
    (it is not written byte-for-byte). Two tags in the 0th IFD are set:

    * ``Artist``: the image URL (``data.url``).
    * ``ImageDescription``: the last ``/``-separated segment of the URL.

    EXIF data already embedded in the image is loaded first, so other
    existing tags are carried over and only these two are overwritten.

    Args:
        id: File path relative to ``self.root_dir``; missing parent
            directories are created.
        data: Response-like object exposing ``content`` (the image bytes)
            and ``url`` (the address the image was fetched from).
    """
    filepath = osp.join(self.root_dir, id)
    folder = osp.dirname(filepath)
    if folder:
        # exist_ok avoids the check-then-create race between workers that
        # write into the same directory.
        os.makedirs(folder, exist_ok=True)

    source_url = data.url
    # Close the decoded image once it has been written out.
    with Image.open(BytesIO(data.content)) as image:
        embedded_exif = image.info.get("exif")
        if embedded_exif:
            exif_dict = piexif.load(embedded_exif)
        else:
            exif_dict = {
                "0th": {},
                "Exif": {},
                "GPS": {},
                "Interop": {},
                "1st": {},
                "thumbnail": None,
            }
        exif_dict["0th"][piexif.ImageIFD.Artist] = source_url.encode("utf-8")
        exif_dict["0th"][piexif.ImageIFD.ImageDescription] = (
            source_url.split("/")[-1].encode("utf-8")
        )
        exif_bytes = piexif.dump(exif_dict)
        image.save(filepath, exif=exif_bytes)

    # Dumping the stored EXIF re-opens the file, so only do it when debug
    # output was actually requested instead of printing for every image.
    log = logging.getLogger(__name__)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("EXIF written to %s:\n%s", filepath, read_EXIF(filepath))

def write(self, id, data):
filepath = osp.join(self.root_dir, id)
folder = osp.dirname(filepath)
Expand Down

0 comments on commit f177cdc

Please sign in to comment.