Skip to content

Commit

Permalink
fix GoogleCrawler
Browse files (browse the repository at this point in the history)
  • Loading branch information
ZhiyuanChen committed May 15, 2024
1 parent f4ae779 commit e062fd0
Showing 1 changed file with 9 additions and 4 deletions.
13 changes: 9 additions & 4 deletions icrawler/builtin/google.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -150,16 +150,21 @@ def parse(self, response):
# txt = div.text
txt = str(div)
# if not txt.startswith('AF_initDataCallback'):
if "AF_initDataCallback" not in txt:
continue
if "ds:0" in txt or "ds:1" not in txt:
continue
# if "AF_initDataCallback" not in txt:
# continue
# if "ds:0" in txt or "ds:1" not in txt:
# continue
# txt = re.sub(r"^AF_initDataCallback\({.*key: 'ds:(\d)'.+data:function\(\){return (.+)}}\);?$",
# "\\2", txt, 0, re.DOTALL)
# meta = json.loads(txt)
# data = meta[31][0][12][2]
# uris = [img[1][3][0] for img in data if img[0] == 1]

uris = re.findall(r"http[^\[]*?.(?:jpg|png|bmp)", txt)
uris = [bytes(uri, "utf-8").decode("unicode-escape") for uri in uris]
if uris:
return [{"file_url": uri} for uri in uris]

uris = re.findall(r"http[^\[]*?\.(?:jpg|png|bmp)", txt)
return [{"file_url": uri} for uri in uris]

Expand Down

0 comments on commit e062fd0

Please sign in to comment.