Add autogenerated messages #518

Open · wants to merge 3 commits into main
3 changes: 2 additions & 1 deletion Pipfile
@@ -32,7 +32,8 @@ pyright = "*"
types-beautifulsoup4 = "*"
selenium = "*"
gunicorn = "21.2.0"
flask-api = {editable = true, ref = "bugfix/159/remove-werkzeug-deprecated-calls", git = "git+https://github.com/codders/flask-api.git"}
flask-api = {editable = true, git = "git+https://github.com/flask-api/flask-api.git"}
openai = "*"

[dev-packages]

1,167 changes: 693 additions & 474 deletions Pipfile.lock

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions config.yaml.dist
@@ -94,6 +94,7 @@ filters:
# - {price}: Price for the flat
# - {durations}: Durations calculated by GMaps, see above
# - {url}: URL to the expose
# - {application}: The generated application text
message: |
{title}
Zimmer: {rooms}
@@ -195,6 +196,17 @@ slack:
# host: 127.0.0.1
# port: 8080

# Sending autogenerated texts with your notifications requires access to
# the OpenAI API.

# openai:
# api_key: sk-...
# language: en
# enable: False
# template: |
#   Insert a multi-line template here. Use [ ] to mark placeholders that the LLM will fill in.
openai:

# If you are deploying to google cloud,
# uncomment this and set it to your project id. More info in the readme.
# google_cloud_project_id: my-flathunters-project-id
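
Returning to the openai block above: for illustration, a filled-in configuration might look like the following. The template wording is only a hypothetical example; any text with [ ] placeholders works.

    openai:
      api_key: sk-...
      language: de
      enable: True
      template: |
        Sehr geehrte/r [lessor],
        hiermit bewerbe ich mich auf die [rooms]-Zimmer-Wohnung aus Ihrem Inserat.
        [One or two sentences referring to features from the description.]
        Mit freundlichen Grüßen
        [your name]
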
4 changes: 2 additions & 2 deletions flathunt.py
@@ -10,7 +10,7 @@
from flathunter.argument_parser import parse
from flathunter.logging import logger, configure_logging
from flathunter.idmaintainer import IdMaintainer
from flathunter.hunter import Hunter
from flathunter.detailed_hunter import OpenAIHunter
from flathunter.config import Config
from flathunter.heartbeat import Heartbeat
from flathunter.time_utils import wait_during_period
@@ -31,7 +31,7 @@ def launch_flat_hunt(config, heartbeat: Heartbeat):

wait_during_period(time_from, time_till)

hunter = Hunter(config, id_watch)
hunter = OpenAIHunter(config, id_watch)

Reviewer comment:
Why do you want to create an OpenAIHunter here, instead of just adding generate_text to the filters for the default Hunter?

Author reply:
Hi! Sorry it's been a while...
It's not just about generate_text, it's mostly about crawl_expose_details, which is needed to get the description of an exposé; the description in turn is needed to extract the features worth mentioning in an application (at least that was my idea).
I do agree that changing the Hunter is a bit silly, and maybe the whole new Hunter itself is, but I did not like the idea of crawling the details every time.

Author reply:
As another idea, I could introduce some additional logic in generate_text() for this. I was thinking of something like:

    def generate_text(self):
        """Add processor to generate text, if enabled"""
        if self.config.openai_enabled():
            if not any(isinstance(processor, CrawlExposeDetails) for processor in self.processors):
                self.crawl_expose_details()
            self.processors.append(OpenAIProcessor(self.config))
        return self

How about this to prevent the need for a new hunter?

hunter.hunt_flats()
counter = 0

43 changes: 43 additions & 0 deletions flathunter/config.py
@@ -66,6 +66,10 @@ class Env:
FLATHUNTER_MATTERMOST_WEBHOOK_URL = _read_env(
"FLATHUNTER_MATTERMOST_WEBHOOK_URL")
FLATHUNTER_SLACK_WEBHOOK_URL = _read_env("FLATHUNTER_SLACK_WEBHOOK_URL")
FLATHUNTER_OPENAI_API_KEY = _read_env("FLATHUNTER_OPENAI_API_KEY")
FLATHUNTER_OPENAI_LANGUAGE = _read_env("FLATHUNTER_OPENAI_LANGUAGE")
FLATHUNTER_OPENAI_ENABLE = _read_env("FLATHUNTER_OPENAI_ENABLE")
FLATHUNTER_OPENAI_TEMPLATE = _read_env("FLATHUNTER_OPENAI_TEMPLATE")

# Filters
FLATHUNTER_FILTER_EXCLUDED_TITLES = _read_env(
@@ -274,6 +278,22 @@ def slack_webhook_url(self):
"""Webhook for sending Slack messages"""
return self._read_yaml_path('slack.webhook_url', "")

def openai_api_key(self):
"""API Key for OpenAI"""
return self._read_yaml_path('openai.api_key', None)

def openai_language(self):
"""Language to generate text in"""
return self._read_yaml_path('openai.language', "en")

def openai_enabled(self):
"""True if OpenAI text generation is enabled"""
return bool(self._read_yaml_path('openai.enable', False))

def openai_template(self):
"""Template for OpenAI text generation"""
return self._read_yaml_path('openai.template', "")

def apprise_urls(self):
"""Notification URLs for Apprise"""
return self._read_yaml_path('apprise', [])
@@ -366,6 +386,9 @@ def __repr__(self):
"slack_webhook_url": self.slack_webhook_url(),
"telegram_receiver_ids": self.telegram_receiver_ids(),
"telegram_bot_token": elide(self.telegram_bot_token()),
"openai_api_key": elide(self.openai_api_key()),
"openai_language": self.openai_language(),
"openai_enabled": self.openai_enabled(),
"target_urls": self.target_urls(),
"use_proxy": self.use_proxy(),
})
@@ -514,6 +537,26 @@ def slack_webhook_url(self):
return Env.FLATHUNTER_SLACK_WEBHOOK_URL
return super().slack_webhook_url()

def openai_api_key(self) -> Optional[str]:
if Env.FLATHUNTER_OPENAI_API_KEY is not None:
return Env.FLATHUNTER_OPENAI_API_KEY
return super().openai_api_key()

def openai_language(self) -> str:
if Env.FLATHUNTER_OPENAI_LANGUAGE is not None:
return Env.FLATHUNTER_OPENAI_LANGUAGE
return super().openai_language()

def openai_enabled(self) -> bool:
if Env.FLATHUNTER_OPENAI_ENABLE is not None:
return str(Env.FLATHUNTER_OPENAI_ENABLE) == 'true'
return super().openai_enabled()

def openai_template(self) -> str:
if Env.FLATHUNTER_OPENAI_TEMPLATE is not None:
return Env.FLATHUNTER_OPENAI_TEMPLATE
return super().openai_template()

def excluded_titles(self):
if Env.FLATHUNTER_FILTER_EXCLUDED_TITLES is not None:
return Env.FLATHUNTER_FILTER_EXCLUDED_TITLES.split(";")
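
The new environment variables mirror the YAML keys and take precedence over them. A minimal sketch of the override behaviour (assuming a config.yaml is present in the working directory; the variables must be set before flathunter.config is imported, because Env reads them at import time):

    import os

    # Must be set before flathunter.config is imported (Env reads them once).
    os.environ["FLATHUNTER_OPENAI_ENABLE"] = "true"
    os.environ["FLATHUNTER_OPENAI_LANGUAGE"] = "de"

    from flathunter.config import Config

    config = Config()                        # still reads config.yaml for everything else
    assert config.openai_enabled()           # env var overrides openai.enable
    assert config.openai_language() == "de"  # env var overrides openai.language
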
16 changes: 15 additions & 1 deletion flathunter/crawler/immobilienscout.py
@@ -181,12 +181,26 @@ def get_page(self, search_url, driver=None, page_no=None):

def get_expose_details(self, expose):
"""Loads additional details for an expose by processing the expose detail URL"""
soup = self.get_soup_from_url(expose['url'])
driver = self.get_driver()
if driver is not None:

Reviewer comment:
So do I understand from this that you need to open a second tab to load the expose to get the details? Is there a reason to do this here instead of in the OpenAIProcessor? If we have to do it here, can we have a switch to skip this if the OpenAI feature is disabled? Otherwise everybody not using OpenAI will make a bunch of unnecessary fetches.

Author reply:
Actually, the whole get_expose_details is broken for me with Immobilienscout if one does not pass a driver: the request initially returns a "Checking that you're not a robot" page, and the final content of the page is only returned once this check has finished. But I can simply cherry-pick this into a different pull request.
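
A minimal sketch of the switch the reviewer asks for, assuming the crawler keeps a reference to the config object (hypothetical wiring, not part of this PR):

    def get_expose_details(self, expose):
        """Loads additional details for an expose by processing the expose detail URL"""
        # Hypothetical guard: only do the driver/second-tab fetch when the OpenAI
        # processor (the only consumer of description/lessor) is actually enabled.
        if not self.config.openai_enabled():
            soup = self.get_soup_from_url(expose['url'])
            date = soup.find('dd', {"class": "is24qa-bezugsfrei-ab"})
            if date is not None and not re.match(r'.*sofort.*', date.text):
                expose['from'] = date.text.strip()
            return expose
        # ...otherwise proceed with the driver-based fetch added below...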

original_window = driver.current_window_handle
driver.switch_to.new_window('tab')
soup = self.get_soup_from_url(expose['url'], driver=driver)
date = soup.find('dd', {"class": "is24qa-bezugsfrei-ab"})
description = soup.find('pre', {"class": "is24qa-objektbeschreibung"})
name_elems = soup.find_all(lambda e: e.has_attr('class') and len(e['class']) > 0 and 'realtorInfoNameAndRatingContainer' in e['class'][0])
# Get the first child of the 0th elem
name_elem = name_elems[0].findChildren()[0] if name_elems else None
name = name_elem.text.strip() if name_elem is not None else ''
expose['description'] = description.text.strip() if description is not None else ''
expose['from'] = datetime.datetime.now().strftime("%2d.%2m.%Y")
expose['lessor'] = name
if date is not None:
if not re.match(r'.*sofort.*', date.text):
expose['from'] = date.text.strip()
if driver is not None:
driver.close()
driver.switch_to.window(original_window)
return expose

# pylint: disable=too-many-locals
36 changes: 36 additions & 0 deletions flathunter/detailed_hunter.py
@@ -0,0 +1,36 @@
"""Flathunter implementation for detailed exposé with OpenAI text generation."""
from flathunter.config import YamlConfig
from flathunter.logging import logger
from flathunter.hunter import Hunter
from flathunter.filter import Filter
from flathunter.processor import ProcessorChain
from flathunter.exceptions import BotBlockedException, UserDeactivatedException

class OpenAIHunter(Hunter):

Reviewer comment:
This looks fine but per my other comment I think I would prefer just to change the default Hunter.

"""Flathunter implementation for detailed exposé with OpenAI text generation"""


def hunt_flats(self, max_pages=None):
"""Crawl, process and filter exposes"""
filter_set = Filter.builder() \
.read_config(self.config) \
.filter_already_seen(self.id_watch) \
.build()

processor_chain = ProcessorChain.builder(self.config) \
.save_all_exposes(self.id_watch) \
.apply_filter(filter_set) \
.crawl_expose_details() \
.resolve_addresses() \
.calculate_durations() \
.generate_text() \
.send_messages() \
.build()

result = []
# We need to iterate over this list to force the evaluation of the pipeline
for expose in processor_chain.process(self.crawl_for_exposes(max_pages)):
logger.info('New offer: %s', expose['title'])
result.append(expose)

return result
52 changes: 52 additions & 0 deletions flathunter/openai_processor.py
@@ -0,0 +1,52 @@
"""Generate a text for an exposé with the help of OpenAI's API"""
import datetime
import time
from urllib.parse import quote_plus
import requests
from openai import OpenAI

from flathunter.logging import logger
from flathunter.abstract_processor import Processor

class OpenAIProcessor(Processor):
"""Implementation of Processor class to calculate travel durations"""


def __init__(self, config):
self.config = config
self.client = OpenAI(api_key=self.config.openai_api_key())
self.language = self.config.openai_language()

def process_expose(self, expose):
"""Calculate the durations for an exposé"""
expose['generated_text'] = self.generate_query_text(expose['rooms'], expose['description'], expose['lessor']).strip()
return expose

def generate_query_text(self, rooms, description, lessor) -> str:
"""Generate a text for an exposé with the help of OpenAI's API"""
response = self.__openai_request(rooms, description, lessor)
return response

def __openai_request(self, room_count, description, lessor) -> str:
"""Send a request to OpenAI's API"""
chat_completion = self.client.chat.completions.create(
messages = [
{
"role": "system",
"content": f"You are helping in generating an application for an exposé. For this, you will be provided with a dictionary which contains information on the kind of flat you are applying for and a prewritten text. The application is supposed to be written in {self.language}. Fill in the blanks, marked by [] in the text with the information from the dictionary. Do not directly copy text (except room information or similar things) from the dictionary - instead, paraphrase it."

Reviewer comment:
Just wondering if it also makes sense to have a German option here - presumably a lot of people want to generate their texts in German.

Author reply:
Interesting idea! I'm currently running this with a German template and it works just fine - nonetheless, I can add an option for German if you'd like me to.

},
{
"role": "user",
"content": f"Template:\n\n{self.config.openai_template()}"
},
{
"role": "user",
"content": f"Dictionary:\n\nrooms: {room_count},\nlessor to write to:{lessor},\ndescription: {description}"
}
],
model="gpt-3.5-turbo"
)
logger.info("Requested text generation from OpenAI API successfully")
logger.debug("Exposé: %s", description)
logger.debug("OpenAI response: %s", chat_completion)
return chat_completion.choices[0].message.content
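
For illustration, the processor can also be exercised on its own; a minimal sketch, assuming a config.yaml with a valid openai.api_key and using made-up expose values:

    from flathunter.config import Config
    from flathunter.openai_processor import OpenAIProcessor

    config = Config()                    # assumes a config.yaml with an openai: section
    processor = OpenAIProcessor(config)
    expose = {
        'rooms': '3',
        'lessor': 'Example Hausverwaltung',                    # hypothetical values
        'description': 'Helle 3-Zimmer-Wohnung mit Balkon ...',
    }
    expose = processor.process_expose(expose)
    print(expose['generated_text'])                            # text returned by the model

In the PR itself the processor is attached via the generate_text() step on the ProcessorChain builder, so manual construction like this would mainly be useful for testing.
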
7 changes: 7 additions & 0 deletions flathunter/processor.py
@@ -11,6 +11,7 @@
from flathunter.sender_telegram import SenderTelegram
from flathunter.sender_slack import SenderSlack
from flathunter.gmaps_duration_processor import GMapsDurationProcessor
from flathunter.openai_processor import OpenAIProcessor
from flathunter.idmaintainer import SaveAllExposesProcessor
from flathunter.abstract_processor import Processor

@@ -48,6 +49,12 @@ def calculate_durations(self):
self.processors.append(GMapsDurationProcessor(self.config))
return self

def generate_text(self):
"""Add processor to generate text, if enabled"""
if self.config.openai_enabled():
self.processors.append(OpenAIProcessor(self.config))
return self

def crawl_expose_details(self):
"""Add processor to crawl expose details"""
self.processors.append(CrawlExposeDetails(self.config))
3 changes: 2 additions & 1 deletion flathunter/sender_telegram.py
@@ -173,5 +173,6 @@ def __get_text_message(self, expose: Dict) -> str:
price=expose.get('price', 'N/A'),
url=expose.get('url', 'N/A'),
address=expose.get('address', 'N/A'),
durations=expose.get('durations', 'N/A')
durations=expose.get('durations', 'N/A'),
application=expose.get('generated_text', 'N/A')
).strip()
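
For context, the notification text is produced by rendering the message template from config.yaml with str.format, so the new placeholder behaves exactly like the existing ones; a small sketch with an illustrative template:

    # Illustrative template; the real one lives under `message:` in config.yaml.
    message_template = (
        "{title}\n"
        "Zimmer: {rooms}\n"
        "{url}\n"
        "\n"
        "{application}\n"                         # the autogenerated application text
    )
    # expose is the dict produced by the processor chain.
    text = message_template.format(
        title=expose.get('title', 'N/A'),
        rooms=expose.get('rooms', 'N/A'),
        url=expose.get('url', 'N/A'),
        application=expose.get('generated_text', 'N/A'),
    ).strip()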