
Commit 3a8fe16

Committed Nov 5, 2020
Fix FetchTweets by switching from twitterscraper to twint 🎉
See twintproject/twint#604 (comment) for more information.
1 parent a1454bd · commit 3a8fe16

2 files changed (+27 -30 lines)

docker/requirements.txt (+1 -2)
@@ -26,8 +26,7 @@ validators==0.18.1
 
 # Data sources
 google-api-python-client==1.12.5
-git+https://github.com/Museum-Barberini-gGmbH/twitterscraper.git#egg=twitterscraper
-twint==2.1.20
+git+https://github.com/twintproject/twint.git#egg=twint
 
 # Analysis tools
 git+https://github.com/rwalk/gsdmm.git#egg=gsdmm
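This single git requirement replaces two pinned dependencies: the patched twitterscraper fork and the twint 2.1.20 release from PyPI. Pinning twint to the repository head suggests that the fixes discussed in the linked twint issue had not yet been released to PyPI at the time. As a hypothetical smoke test for the new dependency (not part of the commit), the following sketch exercises only the twint API surface that the diff in src/twitter.py below relies on, using the task's default query:

import twint

# Collect results into a local list rather than twint's global output buffer.
tweets = []
twint.run.Search(twint.Config(
    Search="museumbarberini",  # the FetchTwitter task's default query
    Limit=20,                  # keep the smoke test small
    Store_object=True,
    Store_object_tweets_list=tweets,
    Hide_output=True,
))
print(f"Fetched {len(tweets)} tweets")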

src/twitter.py (+26 -28)
@@ -1,13 +1,14 @@
 """Provides tasks for downloading tweets related to the museum."""
 
 import datetime as dt
+import dateutil
 from pytz import utc
 import re
 
 import luigi
 from luigi.format import UTF8
 import pandas as pd
-import twitterscraper as ts
+import twint
 import tzlocal
 
 from _utils import CsvToDb, DataPreparationTask, MuseumFacts, logger
@@ -114,7 +115,7 @@ def output(self):
 
 
 class FetchTwitter(DataPreparationTask):
-    """Fetch tweets related to the museum using the twitterscraper."""
+    """Fetch tweets related to the museum using twint."""
 
     query = luigi.Parameter(default="museumbarberini")
     timespan = luigi.parameter.TimeDeltaParameter(
@@ -132,12 +133,30 @@ def run(self):
         if self.minimal_mode:
             timespan = dt.timedelta(days=5)
 
-        tweets = ts.query_tweets(
-            self.query,
-            begindate=dt.date.today() - timespan,
-            enddate=dt.date.today() + dt.timedelta(days=1))
+        tweets: twint.tweet.tweet = []
+        twint.run.Search(twint.Config(
+            Search=self.query,
+            Since=str(dt.date.today() - timespan),
+            Until=str(dt.date.today() + dt.timedelta(days=1)),
+            Limit=10000,
+            Store_object=True,
+            Store_object_tweets_list=tweets,
+            Hide_output=True
+        ))
         if tweets:
-            df = pd.DataFrame([tweet.__dict__ for tweet in tweets])
+            df = pd.DataFrame([
+                dict(
+                    user_id=tweet.user_id,
+                    tweet_id=tweet.id,
+                    text=tweet.tweet,
+                    parent_tweet_id=None,  # TODO: Nuke
+                    timestamp=dateutil.parser.parse(tweet.datetime),
+                    likes=tweet.likes_count,
+                    retweets=tweet.retweets_count,
+                    replies=tweet.replies_count
+                )
+                for tweet in tweets
+            ])
         else:  # no tweets returned, ensure schema
             df = pd.DataFrame(columns=[
                 'user_id',
@@ -149,29 +168,8 @@ def run(self):
                 'retweets',
                 'replies'])
 
-        # Filter out false positive matches. This is obviously a workaround,
-        # but at the moment cheaper than repairing or switching the scraper.
-        # See #352.
-        is_false_positive = ~(
-            df['parent_tweet_id'].apply(bool)
-            | df['text'].str.contains(self.query, flags=re.IGNORECASE)
-            | df['screen_name'].str.contains(self.query, flags=re.IGNORECASE))
-        if is_false_positive.any():
-            false_positives = df[is_false_positive]
-            logger.warning(
-                f"Dropping {len(false_positives)} tweets that are not "
-                f"related to the query"
-            )
-            df = df[~is_false_positive]
-
         df = df.drop_duplicates(subset=['tweet_id'])
 
-        # timestamp is utc by default
-        df['timestamp'] = df['timestamp'].apply(
-            lambda utc_dt:
-                utc.localize(utc_dt, is_dst=None).astimezone(
-                    tzlocal.get_localzone()))
-
         with self.output().open('w') as output_file:
             df.to_csv(output_file, index=False, header=True)

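Taken together, the run() changes swap twitterscraper's query_tweets() call for a twint search that collects twint.tweet.tweet objects into a local list via Store_object_tweets_list and then projects each object onto the task's CSV schema. The two deleted blocks follow from the switch: the false-positive filter (see #352) goes away, presumably because twint matches the query server-side, and the UTC-localization step goes away because tweet.datetime is now parsed with dateutil, which handles the timestamp strings twint produces. One caveat: the commit writes "import dateutil" but calls dateutil.parser.parse(); this works only because pandas imports the dateutil.parser submodule as a side effect, so standalone code should import it explicitly. The following condensed sketch runs the new fetch path outside Luigi; the query and timespan values are examples, since the real task reads them from Luigi parameters, and the parent_tweet_id placeholder column (flagged # TODO: Nuke above) is left out:

import datetime as dt

import dateutil.parser  # explicit submodule import; see the caveat above
import pandas as pd
import twint

query = "museumbarberini"         # example; a luigi.Parameter in the task
timespan = dt.timedelta(days=60)  # example; a TimeDeltaParameter in the task

tweets = []
twint.run.Search(twint.Config(
    Search=query,
    Since=str(dt.date.today() - timespan),
    Until=str(dt.date.today() + dt.timedelta(days=1)),
    Limit=10000,
    Store_object=True,
    Store_object_tweets_list=tweets,
    Hide_output=True,
))

# Passing columns= keeps the schema stable even when no tweets come back,
# mirroring the task's empty-DataFrame fallback branch.
columns = ['user_id', 'tweet_id', 'text', 'timestamp',
           'likes', 'retweets', 'replies']
df = pd.DataFrame([
    dict(
        user_id=tweet.user_id,
        tweet_id=tweet.id,
        text=tweet.tweet,
        timestamp=dateutil.parser.parse(tweet.datetime),
        likes=tweet.likes_count,
        retweets=tweet.retweets_count,
        replies=tweet.replies_count,
    )
    for tweet in tweets
], columns=columns).drop_duplicates(subset=['tweet_id'])
print(df.head())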