forked from jakobowsky/otomoto-crawler
job_crawler2.py

from bs4 import BeautifulSoup
import requests
from dataclasses import dataclass, asdict
from typing import List
from urllib.parse import urljoin
import csv


@dataclass
class Vacancy:
    title: str
    company: str
    location: str


def fetch_html(url: str) -> BeautifulSoup:
    """Fetch HTML content from a given URL and return the BeautifulSoup object."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Raise an exception for unsuccessful requests
    return BeautifulSoup(response.content, 'html.parser')
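
# Note: requests sends a generic "python-requests" User-Agent by default. If the
# site ever starts rejecting such requests, passing a browser-like header, e.g.
# requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10), is a
# common workaround; this is an optional hardening step, not something the
# original script relies on.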


def extract_job_categories(soup: BeautifulSoup) -> List[str]:
    """Extract job category URLs from the main RocketJobs.pl page."""
    categories_div = soup.select_one('div.css-4k04cs div.css-1k0qr6v div.css-1jmm92x div.css-5hh74a div.css-138ir5g div.css-1sg36r4 div.css-14rp0mq div.css-bi6kiv div.css-8e6m8o')
    if categories_div:
        return [category['href'] for category in categories_div.select('li a')]
    return []
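
# The long chain of css-* selectors above (and the class names used below) are
# auto-generated by the site's styling pipeline, so they are likely to change
# whenever RocketJobs.pl is redeployed; if the crawler suddenly returns no
# results, these selectors are the first thing to re-check against the live
# HTML. Also note that extract_job_categories is defined here but not used by
# the crawl flow further down.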


def extract_job_postings(soup: BeautifulSoup) -> List[Vacancy]:
    """Extract job postings from the BeautifulSoup object."""
    job_postings = []
    job_listings = soup.find_all('div', class_='css-1sg36r4')
    for job_listing in job_listings:
        title_element = job_listing.find('h3', class_='css-162lmgr')
        company_element = job_listing.find('span', class_='css-5yy118')
        location_element = job_listing.find('span', class_='css-5aysv1')
        if title_element and company_element and location_element:
            job_postings.append(Vacancy(
                title=title_element.text.strip(),
                company=company_element.text.strip(),
                location=location_element.text.strip()
            ))
    return job_postings
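
# Expected shape of a single listing, simplified (class names as matched above):
#   <div class="css-1sg36r4">
#       <h3 class="css-162lmgr">Job title</h3>
#       <span class="css-5yy118">Company</span>
#       <span class="css-5aysv1">Location</span>
#   </div>
# Listings missing any of the three elements are skipped rather than saved with
# empty fields.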


def get_pagination_links(soup: BeautifulSoup) -> List[str]:
    """Extract pagination links from the BeautifulSoup object."""
    pagination = soup.find("ul", class_="css-musp26")
    if pagination:
        return [link["href"] for link in pagination.find_all("a")]
    return []


def crawl_rocketjobs(url: str) -> List[Vacancy]:
    """Crawl RocketJobs.pl to extract job listings, handling pagination."""
    all_jobs = []
    try:
        soup = fetch_html(url)
        job_postings = extract_job_postings(soup)
        all_jobs.extend(job_postings)
        pagination_links = get_pagination_links(soup)
        for page_link in pagination_links[1:]:  # Skip the first link (current page)
            # urljoin resolves both absolute and relative hrefs against the
            # starting URL, instead of blindly concatenating the two strings.
            full_url = urljoin(url, page_link)
            soup = fetch_html(full_url)
            job_postings = extract_job_postings(soup)
            all_jobs.extend(job_postings)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while crawling the URL: {e}")
    return all_jobs
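
# Pagination links are collected once, from the first page only; the crawler
# then visits each of those links but does not re-read the pagination bar on
# later pages, so very long result sets may be truncated if the site only
# exposes a window of page links at a time.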


def save_to_csv(vacancies: List[Vacancy], filename: str):
    """Save a list of Vacancy instances to a CSV file."""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'company', 'location']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for vacancy in vacancies:
            writer.writerow(asdict(vacancy))
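
# The resulting file starts with a "title,company,location" header row followed
# by one row per Vacancy, in the order the postings were scraped.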


# Example usage
if __name__ == '__main__':
    starting_url = 'https://rocketjobs.pl/wszystkie-lokalizacje/marketing/ecommerce'
    job_postings = crawl_rocketjobs(starting_url)

    # Print or process the extracted job postings
    for job in job_postings:
        print(f"Title: {job.title}")
        print(f"Company: {job.company}")
        print(f"Location: {job.location}")
        print("---")

    # Save the job postings to a CSV file
    save_to_csv(job_postings, 'job_postings.csv')