-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapper.py
118 lines (103 loc) · 4.82 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import json
from collections import OrderedDict
from datetime import datetime, timedelta
import os
def get_review(div: BeautifulSoup) -> dict:
    """
    Extract one review's fields from its HTML container.

    Parameters:
        div: Parsed review element; it is queried with ``find`` and ``find_all``.

    Returns:
        A dictionary with the keys ``"Name"``, ``"Content"``, ``"Date"`` and
        ``"Rating"``. A text field that cannot be located or parsed is set to
        ``"N/A"``. ``"Rating"`` maps each sub-rating label to its integer score
        and is an empty dict when no sub-rating can be parsed.
    """
    review = {}

    # Extraction is deliberately best-effort: a missing element makes ``find``
    # return None (so ``.text`` raises) and the field falls back to "N/A"
    # instead of aborting the whole scrape.
    try:
        review["Name"] = div.find('p', class_='_1p30XHjz2rI- C7Tp-bANpE4-').text  # Customer name
    except Exception:
        review["Name"] = "N/A"

    try:
        review["Content"] = div.find("span", class_="l9bbXUdC9v0- ZatlKKd1hyc- ukvN6yaH1Ds-").text  # Review body
    except Exception:
        review["Content"] = "N/A"

    try:
        date_text = div.find("p", class_="iLkEeQbexGs-").text  # Date of the review
        if "Dined on " in date_text:
            # Absolute form, e.g. "Dined on March 3, 2024"
            review["Date"] = date_text.removeprefix("Dined on ")
        else:
            # Relative form, e.g. "Dined 6 days ago". Read the whole number
            # rather than a single character so multi-digit counts
            # ("Dined 10 days ago") are handled correctly.
            words = date_text.lower().split()
            days_ago = next((int(word) for word in words if word.isdecimal()), None)
            if days_ago is None:
                # Phrases without a number
                if "today" in words:
                    days_ago = 0
                elif "yesterday" in words:
                    days_ago = 1
            if days_ago is None:
                review["Date"] = "N/A"
            else:
                dined_on = datetime.now().date() - timedelta(days=days_ago)
                review["Date"] = dined_on.strftime("%B %d, %Y")
    except Exception:
        review["Date"] = "N/A"

    ratings = {}
    try:
        rating_items = div.find_all("li", class_="-k5xpTfSXac-")  # Each sub-rating entry
    except Exception:
        rating_items = []
    for item in rating_items:
        try:
            # The score is the last whitespace-separated token; splitting from
            # the right keeps multi-word labels intact.
            rating_type, rating_number = item.text.rsplit(maxsplit=1)
            ratings[rating_type] = int(rating_number)
        except (AttributeError, ValueError):
            # Skip only the malformed entry instead of dropping the rest
            continue
    review["Rating"] = ratings
    return review
# Saving the Reviews in the JSON File
def save_to_file(file_path: str, reviews: dict) -> str:
    """
    Serialize ``reviews`` as 4-space-indented JSON into ``file_path``.

    The file is opened in write mode, so any existing content is replaced.
    A confirmation line is printed and the same ``file_path`` is returned.
    """
    with open(file_path, mode="w") as destination:
        json.dump(reviews, fp=destination, indent=4)
    print(f"All reviews Saved to {file_path}")
    return file_path
def get_dict(file_path: str) -> OrderedDict:
    """
    Load the JSON document stored at ``file_path`` into an ``OrderedDict``.

    When the file does not exist, a notice is printed and an empty
    ``OrderedDict`` is returned instead of raising.
    """
    try:
        source = open(file_path, 'r')
    except FileNotFoundError:
        print(f"Reviews file not found. Please ensure '{file_path}' exists.")
        return OrderedDict()
    with source:
        parsed = json.load(source)
    return OrderedDict(parsed)
def scrape_website(url: str, max_retries: int = 3):
    """
    Given a URL, save the reviews of that website in a JSON file named after the page title.

    Parameters:
        url: Address of the page whose paginated reviews are scraped.
        max_retries: Maximum number of consecutive refresh-and-retry attempts
            after a failed "next page" step. Once exceeded, scraping stops and
            the reviews collected so far are saved.

    Returns:
        The JSON file name ``<title>_reviews.json`` (spaces in the title are
        replaced by underscores). If a file with that name already exists in
        the current directory it is returned as-is without scraping again.

    Raises:
        Any exception raised by the initial page load, the element lookups or
        the page-count parsing propagates to the caller; the browser is closed
        first in every case.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    # try/finally guarantees the browser process is released even when a wait
    # times out or an element lookup fails part-way through.
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(lambda drv: drv.execute_script("return document.readyState") == "complete")
        driver.execute_script("window.scrollTo(0,5000);")
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "_1BEc9Aeng-Q-")))  # Next Button Location
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "gRlAhBweGeE-")))

        title = driver.find_element(By.XPATH, "/html/body/div[1]/div/div/main/div/div[1]/div[2]/div/div/div/h1").text
        title = title.replace(" ", "_")
        file_path = title + "_reviews.json"
        if file_path in os.listdir("."):
            # Already scraped earlier: reuse the existing file
            return file_path

        total_pages = int(driver.find_element(By.CLASS_NAME, "_1BEc9Aeng-Q-").find_element(By.XPATH, ".//div/ul/li[5]/a").text)
        reviews = OrderedDict()
        page_number = 1
        number = 0
        consecutive_failures = 0
        while True:
            page_number += 1
            response = BeautifulSoup(driver.page_source, 'html.parser')
            contents = response.find_all('li', class_="afkKaa-4T28-")
            for review in contents:
                number += 1
                reviews["Customer " + str(number)] = get_review(review)
            try:
                next_button = driver.find_element(By.CLASS_NAME, "_1BEc9Aeng-Q-").find_element(By.XPATH, ".//div/div[2]/a")
                next_button.click()
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "afkKaa-4T28-")))
                consecutive_failures = 0
            except Exception as e:
                consecutive_failures += 1
                # The retry count is bounded because page_number is incremented
                # and then decremented on every failed iteration, so the
                # page_number < total_pages test alone would never end the loop
                # if the next-page step kept failing.
                if page_number < total_pages and consecutive_failures <= max_retries:
                    driver.refresh()
                    driver.execute_script("window.scrollTo(0,5000);")
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "_1BEc9Aeng-Q-")))  # Next Button Location
                    # NOTE(review): the next iteration harvests whatever page is
                    # shown after the refresh, so reviews may be stored twice
                    # if that is the page already collected - confirm.
                    page_number -= 1
                else:
                    print(e)
                    break
    finally:
        driver.quit()
    # Saving the result in JSON
    return save_to_file(file_path, reviews)