Add autogenerated messages #518

Open · wants to merge 3 commits into main
3 changes: 2 additions & 1 deletion Pipfile
@@ -32,7 +32,8 @@ pyright = "*"
types-beautifulsoup4 = "*"
selenium = "*"
gunicorn = "21.2.0"
flask-api = {editable = true, ref = "bugfix/159/remove-werkzeug-deprecated-calls", git = "git+https://github.com/codders/flask-api.git"}
flask-api = {editable = true, git = "git+https://github.com/flask-api/flask-api.git"}
openai = "*"

[dev-packages]

1,167 changes: 693 additions & 474 deletions Pipfile.lock

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions config.yaml.dist
@@ -94,6 +94,7 @@ filters:
# - {price}: Price for the flat
# - {durations}: Durations calculated by GMaps, see above
# - {url}: URL to the expose
# - {application}: The generated application text
message: |
{title}
Zimmer: {rooms}
@@ -195,6 +196,17 @@ slack:
# host: 127.0.0.1
# port: 8080

# Sending autogenerated texts with your notifications requires access to
# the OpenAI API.

# openai:
# api_key: sk-...
# language: en
# enable: False
# template: |
#   Insert a multi-line template here. Use [ ] to mark placeholders that the LLM will fill in.
openai:

# If you are deploying to google cloud,
# uncomment this and set it to your project id. More info in the readme.
# google_cloud_project_id: my-flathunters-project-id
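
Returning to the openai block above: for illustration, a filled-in configuration might look like the following. The template wording is only a hypothetical example; any text with [ ] placeholders works.

    openai:
      api_key: sk-...
      language: de
      enable: True
      template: |
        Sehr geehrte/r [lessor],
        hiermit bewerbe ich mich auf die [rooms]-Zimmer-Wohnung aus Ihrem Inserat.
        [One or two sentences referring to features from the description.]
        Mit freundlichen Grüßen
        [your name]
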
4 changes: 2 additions & 2 deletions flathunt.py
@@ -10,7 +10,7 @@
from flathunter.argument_parser import parse
from flathunter.logging import logger, configure_logging
from flathunter.idmaintainer import IdMaintainer
from flathunter.hunter import Hunter
from flathunter.detailed_hunter import OpenAIHunter
from flathunter.config import Config
from flathunter.heartbeat import Heartbeat
from flathunter.time_utils import wait_during_period
@@ -31,7 +31,7 @@ def launch_flat_hunt(config, heartbeat: Heartbeat):

wait_during_period(time_from, time_till)

hunter = Hunter(config, id_watch)
hunter = OpenAIHunter(config, id_watch)

Reviewer comment:
Why do you want to create an OpenAIHunter here, instead of just adding generate_text to the filters for the default Hunter?

Author reply:
Hi! Sorry it's been a while...
It's not just about generate_text, it's mostly about crawl_expose_details, which is needed to get the description of an exposé; the description in turn is needed to extract the features worth mentioning in an application (at least that was my idea).
I do agree that changing the Hunter is a bit silly, and maybe the whole new Hunter itself is, but I did not like the idea of crawling the details every time.

Author reply:
As another idea, I could introduce some additional logic in generate_text() for this. I was thinking of something like:

    def generate_text(self):
        """Add processor to generate text, if enabled"""
        if self.config.openai_enabled():
            if not any(isinstance(processor, CrawlExposeDetails) for processor in self.processors):
                self.crawl_expose_details()
            self.processors.append(OpenAIProcessor(self.config))
        return self

How about this to prevent the need for a new hunter?

hunter.hunt_flats()
counter = 0

43 changes: 43 additions & 0 deletions flathunter/config.py
@@ -66,6 +66,10 @@ class Env:
FLATHUNTER_MATTERMOST_WEBHOOK_URL = _read_env(
"FLATHUNTER_MATTERMOST_WEBHOOK_URL")
FLATHUNTER_SLACK_WEBHOOK_URL = _read_env("FLATHUNTER_SLACK_WEBHOOK_URL")
FLATHUNTER_OPENAI_API_KEY = _read_env("FLATHUNTER_OPENAI_API_KEY")
FLATHUNTER_OPENAI_LANGUAGE = _read_env("FLATHUNTER_OPENAI_LANGUAGE")
FLATHUNTER_OPENAI_ENABLE = _read_env("FLATHUNTER_OPENAI_ENABLE")
FLATHUNTER_OPENAI_TEMPLATE = _read_env("FLATHUNTER_OPENAI_TEMPLATE")

# Filters
FLATHUNTER_FILTER_EXCLUDED_TITLES = _read_env(
@@ -274,6 +278,22 @@ def slack_webhook_url(self):
"""Webhook for sending Slack messages"""
return self._read_yaml_path('slack.webhook_url', "")

def openai_api_key(self):
"""API Key for OpenAI"""
return self._read_yaml_path('openai.api_key', None)

def openai_language(self):
"""Language to generate text in"""
return self._read_yaml_path('openai.language', "en")

def openai_enabled(self):
"""True if OpenAI text generation is enabled"""
return bool(self._read_yaml_path('openai.enable', False))

def openai_template(self):
"""Template for OpenAI text generation"""
return self._read_yaml_path('openai.template', "")

def apprise_urls(self):
"""Notification URLs for Apprise"""
return self._read_yaml_path('apprise', [])
@@ -366,6 +386,9 @@ def __repr__(self):
"slack_webhook_url": self.slack_webhook_url(),
"telegram_receiver_ids": self.telegram_receiver_ids(),
"telegram_bot_token": elide(self.telegram_bot_token()),
"openai_api_key": elide(self.openai_api_key()),
"openai_language": self.openai_language(),
"openai_enabled": self.openai_enabled(),
"target_urls": self.target_urls(),
"use_proxy": self.use_proxy(),
})
@@ -514,6 +537,26 @@ def slack_webhook_url(self):
return Env.FLATHUNTER_SLACK_WEBHOOK_URL
return super().slack_webhook_url()

def openai_api_key(self) -> Optional[str]:
if Env.FLATHUNTER_OPENAI_API_KEY is not None:
return Env.FLATHUNTER_OPENAI_API_KEY
return super().openai_api_key()

def openai_language(self) -> str:
if Env.FLATHUNTER_OPENAI_LANGUAGE is not None:
return Env.FLATHUNTER_OPENAI_LANGUAGE
return super().openai_language()

def openai_enabled(self) -> bool:
if Env.FLATHUNTER_OPENAI_ENABLE is not None:
return str(Env.FLATHUNTER_OPENAI_ENABLE) == 'true'
return super().openai_enabled()

def openai_template(self) -> str:
if Env.FLATHUNTER_OPENAI_TEMPLATE is not None:
return Env.FLATHUNTER_OPENAI_TEMPLATE
return super().openai_template()

def excluded_titles(self):
if Env.FLATHUNTER_FILTER_EXCLUDED_TITLES is not None:
return Env.FLATHUNTER_FILTER_EXCLUDED_TITLES.split(";")
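
The new environment variables mirror the YAML keys and take precedence over them. A minimal sketch of the override behaviour (assuming a config.yaml is present in the working directory; the variables must be set before flathunter.config is imported, because Env reads them at import time):

    import os

    # Must be set before flathunter.config is imported (Env reads them once).
    os.environ["FLATHUNTER_OPENAI_ENABLE"] = "true"
    os.environ["FLATHUNTER_OPENAI_LANGUAGE"] = "de"

    from flathunter.config import Config

    config = Config()                        # still reads config.yaml for everything else
    assert config.openai_enabled()           # env var overrides openai.enable
    assert config.openai_language() == "de"  # env var overrides openai.language
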
16 changes: 15 additions & 1 deletion flathunter/crawler/immobilienscout.py
@@ -181,12 +181,26 @@ def get_page(self, search_url, driver=None, page_no=None):

def get_expose_details(self, expose):
"""Loads additional details for an expose by processing the expose detail URL"""
soup = self.get_soup_from_url(expose['url'])
driver = self.get_driver()
if driver is not None:

Reviewer comment:
So do I understand from this that you need to open a second tab to load the expose to get the details? Is there a reason to do this here instead of in the OpenAIProcessor? If we have to do it here, can we have a switch to skip this if the OpenAI feature is disabled? Otherwise everybody not using OpenAI will make a bunch of unnecessary fetches.

Author reply:
Actually, the whole get_expose_details is broken for me with Immobilienscout if one does not pass a driver: the request initially returns a "Checking that you're not a robot" page, and the final content of the page is only returned once this check has finished. But I can simply cherry-pick this into a different pull request.
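
A minimal sketch of the switch the reviewer asks for, assuming the crawler keeps a reference to the config object (hypothetical wiring, not part of this PR):

    def get_expose_details(self, expose):
        """Loads additional details for an expose by processing the expose detail URL"""
        # Hypothetical guard: only do the driver/second-tab fetch when the OpenAI
        # processor (the only consumer of description/lessor) is actually enabled.
        if not self.config.openai_enabled():
            soup = self.get_soup_from_url(expose['url'])
            date = soup.find('dd', {"class": "is24qa-bezugsfrei-ab"})
            if date is not None and not re.match(r'.*sofort.*', date.text):
                expose['from'] = date.text.strip()
            return expose
        # ...otherwise proceed with the driver-based fetch added below...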

original_window = driver.current_window_handle
driver.switch_to.new_window('tab')
soup = self.get_soup_from_url(expose['url'], driver=driver)
date = soup.find('dd', {"class": "is24qa-bezugsfrei-ab"})
description = soup.find('pre', {"class": "is24qa-objektbeschreibung"})
name_elems = soup.find_all(lambda e: e.has_attr('class') and len(e['class']) > 0 and 'realtorInfoNameAndRatingContainer' in e['class'][0])
# Get the first child of the 0th elem
name_elem = name_elems[0].findChildren()[0] if name_elems else None
name = name_elem.text.strip() if name_elem is not None else ''
expose['description'] = description.text.strip() if description is not None else ''
expose['from'] = datetime.datetime.now().strftime("%2d.%2m.%Y")
expose['lessor'] = name
if date is not None:
if not re.match(r'.*sofort.*', date.text):
expose['from'] = date.text.strip()
if driver is not None:
driver.close()
driver.switch_to.window(original_window)
return expose

# pylint: disable=too-many-locals
36 changes: 36 additions & 0 deletions flathunter/detailed_hunter.py
@@ -0,0 +1,36 @@
"""Flathunter implementation for detailed exposé with OpenAI text generation."""
from flathunter.config import YamlConfig
from flathunter.logging import logger
from flathunter.hunter import Hunter
from flathunter.filter import Filter
from flathunter.processor import ProcessorChain
from flathunter.exceptions import BotBlockedException, UserDeactivatedException

class OpenAIHunter(Hunter):

Reviewer comment:
This looks fine but per my other comment I think I would prefer just to change the default Hunter.

"""Flathunter implementation for detailed exposé with OpenAI text generation"""


def hunt_flats(self, max_pages=None):
"""Crawl, process and filter exposes"""
filter_set = Filter.builder() \
.read_config(self.config) \
.filter_already_seen(self.id_watch) \
.build()

processor_chain = ProcessorChain.builder(self.config) \
.save_all_exposes(self.id_watch) \
.apply_filter(filter_set) \
.crawl_expose_details() \
.resolve_addresses() \
.calculate_durations() \
.generate_text() \
.send_messages() \
.build()

result = []
# We need to iterate over this list to force the evaluation of the pipeline
for expose in processor_chain.process(self.crawl_for_exposes(max_pages)):
logger.info('New offer: %s', expose['title'])
result.append(expose)

return result
52 changes: 52 additions & 0 deletions flathunter/openai_processor.py
@@ -0,0 +1,52 @@
"""Generate a text for an exposé with the help of OpenAI's API"""
import datetime
import time
from urllib.parse import quote_plus
import requests
from openai import OpenAI

from flathunter.logging import logger
from flathunter.abstract_processor import Processor

class OpenAIProcessor(Processor):
"""Implementation of Processor class to calculate travel durations"""


def __init__(self, config):
self.config = config
self.client = OpenAI(api_key=self.config.openai_api_key())
self.language = self.config.openai_language()

def process_expose(self, expose):
"""Calculate the durations for an exposé"""
expose['generated_text'] = self.generate_query_text(expose['rooms'], expose['description'], expose['lessor']).strip()
return expose

def generate_query_text(self, rooms, description, lessor) -> str:
"""Generate a text for an exposé with the help of OpenAI's API"""
response = self.__openai_request(rooms, description, lessor)
return response

def __openai_request(self, room_count, description, lessor) -> str:
"""Send a request to OpenAI's API"""
chat_completion = self.client.chat.completions.create(
messages = [
{
"role": "system",
"content": f"You are helping in generating an application for an exposé. For this, you will be provided with a dictionary which contains information on the kind of flat you are applying for and a prewritten text. The application is supposed to be written in {self.language}. Fill in the blanks, marked by [] in the text with the information from the dictionary. Do not directly copy text (except room information or similar things) from the dictionary - instead, paraphrase it."

Reviewer comment:
Just wondering if it also makes sense to have a German option here - presumably a lot of people want to generate their texts in German.

Author reply:
Interesting idea! I'm currently running this with a German template and it works just fine - nonetheless, I can add an option for German if you'd like me to.

},
{
"role": "user",
"content": f"Template:\n\n{self.config.openai_template()}"
},
{
"role": "user",
"content": f"Dictionary:\n\nrooms: {room_count},\nlessor to write to:{lessor},\ndescription: {description}"
}
],
model="gpt-3.5-turbo"
)
logger.info("Requested text generation from OpenAI API successfully")
logger.debug("Exposé: %s", description)
logger.debug("OpenAI response: %s", chat_completion)
return chat_completion.choices[0].message.content
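
For illustration, the processor can also be exercised on its own; a minimal sketch, assuming a config.yaml with a valid openai.api_key and using made-up expose values:

    from flathunter.config import Config
    from flathunter.openai_processor import OpenAIProcessor

    config = Config()                    # assumes a config.yaml with an openai: section
    processor = OpenAIProcessor(config)
    expose = {
        'rooms': '3',
        'lessor': 'Example Hausverwaltung',                    # hypothetical values
        'description': 'Helle 3-Zimmer-Wohnung mit Balkon ...',
    }
    expose = processor.process_expose(expose)
    print(expose['generated_text'])                            # text returned by the model

In the PR itself the processor is attached via the generate_text() step on the ProcessorChain builder, so manual construction like this would mainly be useful for testing.
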
7 changes: 7 additions & 0 deletions flathunter/processor.py
@@ -11,6 +11,7 @@
from flathunter.sender_telegram import SenderTelegram
from flathunter.sender_slack import SenderSlack
from flathunter.gmaps_duration_processor import GMapsDurationProcessor
from flathunter.openai_processor import OpenAIProcessor
from flathunter.idmaintainer import SaveAllExposesProcessor
from flathunter.abstract_processor import Processor

@@ -48,6 +49,12 @@ def calculate_durations(self):
self.processors.append(GMapsDurationProcessor(self.config))
return self

def generate_text(self):
"""Add processor to generate text, if enabled"""
if self.config.openai_enabled():
self.processors.append(OpenAIProcessor(self.config))
return self

def crawl_expose_details(self):
"""Add processor to crawl expose details"""
self.processors.append(CrawlExposeDetails(self.config))
3 changes: 2 additions & 1 deletion flathunter/sender_telegram.py
@@ -173,5 +173,6 @@ def __get_text_message(self, expose: Dict) -> str:
price=expose.get('price', 'N/A'),
url=expose.get('url', 'N/A'),
address=expose.get('address', 'N/A'),
durations=expose.get('durations', 'N/A')
durations=expose.get('durations', 'N/A'),
application=expose.get('generated_text', 'N/A')
).strip()
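
For context, the notification text is produced by rendering the message template from config.yaml with str.format, so the new placeholder behaves exactly like the existing ones; a small sketch with an illustrative template:

    # Illustrative template; the real one lives under `message:` in config.yaml.
    message_template = (
        "{title}\n"
        "Zimmer: {rooms}\n"
        "{url}\n"
        "\n"
        "{application}\n"                         # the autogenerated application text
    )
    # expose is the dict produced by the processor chain.
    text = message_template.format(
        title=expose.get('title', 'N/A'),
        rooms=expose.get('rooms', 'N/A'),
        url=expose.get('url', 'N/A'),
        application=expose.get('generated_text', 'N/A'),
    ).strip()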