-
Notifications
You must be signed in to change notification settings - Fork 183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add autogenerated messages #518
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -181,12 +181,26 @@ def get_page(self, search_url, driver=None, page_no=None): | |
|
||
def get_expose_details(self, expose):
    """Load additional details for an expose by processing its detail URL.

    Fetches the page at ``expose['url']`` and stores three keys on the
    expose:

    * ``description`` - stripped text of the description element, or ``''``
    * ``lessor`` - stripped text of the first child of the realtor name
      container, or ``''``
    * ``from`` - the availability date shown on the page; today's date
      (``DD.MM.YYYY``) when the page shows none or says "sofort"

    When a browser driver is available, the page is loaded in a new tab
    which is always closed again, even if loading or parsing fails.

    Returns the same ``expose`` dict that was passed in.
    """
    driver = self.get_driver()
    original_window = None
    if driver is not None:
        # NOTE(review): a reviewer questioned whether opening a second tab
        # is needed here to load the expose details - confirm.
        original_window = driver.current_window_handle
        driver.switch_to.new_window('tab')
    try:
        if driver is None:
            soup = self.get_soup_from_url(expose['url'])
        else:
            soup = self.get_soup_from_url(expose['url'], driver=driver)

        date = soup.find('dd', {"class": "is24qa-bezugsfrei-ab"})
        description = soup.find('pre', {"class": "is24qa-objektbeschreibung"})
        name_elems = soup.find_all(
            lambda e: e.has_attr('class')
            and len(e['class']) > 0
            and 'realtorInfoNameAndRatingContainer' in e['class'][0]
        )

        # The lessor name is the first child of the first matching container;
        # guard against a container without any children.
        name_children = name_elems[0].findChildren() if name_elems else []
        name_elem = name_children[0] if name_children else None
        name = name_elem.text.strip() if name_elem is not None else ''

        expose['description'] = description.text.strip() if description is not None else ''
        # "%d"/"%m" are the portable zero-padded directives; width-prefixed
        # forms such as "%2d" are not accepted on every platform.
        expose['from'] = datetime.datetime.now().strftime("%d.%m.%Y")
        expose['lessor'] = name
        if date is not None and not re.match(r'.*sofort.*', date.text):
            expose['from'] = date.text.strip()
    finally:
        # Always leave the extra tab and return to the original window.
        if driver is not None:
            driver.close()
            driver.switch_to.window(original_window)
    return expose
|
||
# pylint: disable=too-many-locals | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
"""Flathunter implementation for detailed exposé with OpenAI text generation.""" | ||
from flathunter.config import YamlConfig | ||
from flathunter.logging import logger | ||
from flathunter.hunter import Hunter | ||
from flathunter.filter import Filter | ||
from flathunter.processor import ProcessorChain | ||
from flathunter.exceptions import BotBlockedException, UserDeactivatedException | ||
|
||
class OpenAIHunter(Hunter):
    """Flathunter implementation for detailed exposé with OpenAI text generation

    NOTE(review): a reviewer stated a preference for changing the default
    Hunter instead of adding this subclass.
    """

    def hunt_flats(self, max_pages=None):
        """Crawl, process and filter exposes

        Builds a filter set from the config (skipping already-seen exposes)
        and a processor chain that saves, filters, crawls details, resolves
        addresses, calculates durations, generates text and sends messages.
        Returns the list of exposes yielded by that chain.
        """
        seen_filter = (
            Filter.builder()
            .read_config(self.config)
            .filter_already_seen(self.id_watch)
            .build()
        )

        pipeline = (
            ProcessorChain.builder(self.config)
            .save_all_exposes(self.id_watch)
            .apply_filter(seen_filter)
            .crawl_expose_details()
            .resolve_addresses()
            .calculate_durations()
            .generate_text()
            .send_messages()
            .build()
        )

        # Consuming the pipeline's output item by item is what forces the
        # processing steps to actually run.
        new_offers = []
        crawled = self.crawl_for_exposes(max_pages)
        for offer in pipeline.process(crawled):
            logger.info('New offer: %s', offer['title'])
            new_offers.append(offer)

        return new_offers
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
"""Generate a text for an exposé with the help of OpenAI's API""" | ||
import datetime | ||
import time | ||
from urllib.parse import quote_plus | ||
import requests | ||
from openai import OpenAI | ||
|
||
from flathunter.logging import logger | ||
from flathunter.abstract_processor import Processor | ||
|
||
class OpenAIProcessor(Processor):
    """Implementation of Processor class to generate application texts via OpenAI"""

    # Chat model used for text generation.
    MODEL = "gpt-3.5-turbo"

    def __init__(self, config):
        """Store the config and create the OpenAI client from its API key."""
        self.config = config
        self.client = OpenAI(api_key=self.config.openai_api_key())
        # Target language of the generated text, read from the config.
        self.language = self.config.openai_language()

    def process_expose(self, expose):
        """Generate an application text for an exposé.

        Reads ``expose['rooms']``, ``expose['description']`` and
        ``expose['lessor']`` and stores the stripped result under
        ``expose['generated_text']``. Returns the same expose.
        """
        expose['generated_text'] = self.generate_query_text(
            expose['rooms'], expose['description'], expose['lessor']
        ).strip()
        return expose

    def generate_query_text(self, rooms, description, lessor) -> str:
        """Generate a text for an exposé with the help of OpenAI's API"""
        return self.__openai_request(rooms, description, lessor)

    def __openai_request(self, room_count, description, lessor) -> str:
        """Send a request to OpenAI's API

        Returns the content of the first choice, or an empty string when
        the response carries no text content.
        """
        # NOTE(review): a reviewer asked about a German option; the target
        # language is interpolated from the config value stored in __init__.
        chat_completion = self.client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are helping in generating an application for an exposé. "
                        "For this, you will be provided with a dictionary which contains "
                        "information on the kind of flat you are applying for and a "
                        "prewritten text. The application is supposed to be written in "
                        f"{self.language}. Fill in the blanks, marked by [] in the text "
                        "with the information from the dictionary. Do not directly copy "
                        "text (except room information or similar things) from the "
                        "dictionary - instead, paraphrase it."
                    ),
                },
                {
                    "role": "user",
                    "content": f"Template:\n\n{self.config.openai_template()}",
                },
                {
                    "role": "user",
                    "content": (
                        f"Dictionary:\n\nrooms: {room_count},"
                        f"\nlessor to write to:{lessor},"
                        f"\ndescription: {description}"
                    ),
                },
            ],
            model=self.MODEL,
        )
        logger.info("Requested text generation from OpenAI API successfully")
        logger.debug("Exposé: %s", description)
        logger.debug("OpenAI response: %s", chat_completion)
        # The message content is optional in the API response; keep the
        # declared ``str`` return type so callers can safely call .strip().
        return chat_completion.choices[0].message.content or ""
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you want to create an
OpenAIHunter
here, instead of just adding generate_text
to the filters for the default Hunter
? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi! Sorry it's been a while...
It's not just about the
generate_text
, it's mostly about crawl_expose_details
which is needed for the description of an exposé, and the description in turn is needed to extract the features worth mentioning in an application (at least that was my idea). I do agree that changing the Hunter is silly, and maybe the whole Hunter itself, but I did not like the idea of crawling details every time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As another idea: I could introduce some additional logic in
generate_text()
for this. I was thinking something like:
How about this to prevent the need for a new hunter?