
Commit 3a8fe16

Committed Nov 5, 2020
Fix FetchTweets by switching from twitterscraper to twint 🎉
See twintproject/twint#604 (comment) for more information.
1 parent a1454bd · commit 3a8fe16

2 files changed (+27 -30 lines)

docker/requirements.txt (+1 -2)
@@ -26,8 +26,7 @@ validators==0.18.1
 
 # Data sources
 google-api-python-client==1.12.5
-git+https://github.com/Museum-Barberini-gGmbH/twitterscraper.git#egg=twitterscraper
-twint==2.1.20
+git+https://github.com/twintproject/twint.git#egg=twint
 
 # Analysis tools
 git+https://github.com/rwalk/gsdmm.git#egg=gsdmm
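This single git requirement replaces two pinned dependencies: the patched twitterscraper fork and the twint 2.1.20 release from PyPI. Pinning twint to the repository head suggests that the fixes discussed in the linked twint issue had not yet been released to PyPI at the time. As a hypothetical smoke test for the new dependency (not part of the commit), the following sketch exercises only the twint API surface that the diff in src/twitter.py below relies on, using the task's default query:

import twint

# Collect results into a local list rather than twint's global output buffer.
tweets = []
twint.run.Search(twint.Config(
    Search="museumbarberini",  # the FetchTwitter task's default query
    Limit=20,                  # keep the smoke test small
    Store_object=True,
    Store_object_tweets_list=tweets,
    Hide_output=True,
))
print(f"Fetched {len(tweets)} tweets")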

src/twitter.py (+26 -28)
@@ -1,13 +1,14 @@
 """Provides tasks for downloading tweets related to the museum."""
 
 import datetime as dt
+import dateutil
 from pytz import utc
 import re
 
 import luigi
 from luigi.format import UTF8
 import pandas as pd
-import twitterscraper as ts
+import twint
 import tzlocal
 
 from _utils import CsvToDb, DataPreparationTask, MuseumFacts, logger
@@ -114,7 +115,7 @@ def output(self):
 
 
 class FetchTwitter(DataPreparationTask):
-    """Fetch tweets related to the museum using the twitterscraper."""
+    """Fetch tweets related to the museum using twint."""
 
     query = luigi.Parameter(default="museumbarberini")
     timespan = luigi.parameter.TimeDeltaParameter(
@@ -132,12 +133,30 @@ def run(self):
         if self.minimal_mode:
             timespan = dt.timedelta(days=5)
 
-        tweets = ts.query_tweets(
-            self.query,
-            begindate=dt.date.today() - timespan,
-            enddate=dt.date.today() + dt.timedelta(days=1))
+        tweets: twint.tweet.tweet = []
+        twint.run.Search(twint.Config(
+            Search=self.query,
+            Since=str(dt.date.today() - timespan),
+            Until=str(dt.date.today() + dt.timedelta(days=1)),
+            Limit=10000,
+            Store_object=True,
+            Store_object_tweets_list=tweets,
+            Hide_output=True
+        ))
         if tweets:
-            df = pd.DataFrame([tweet.__dict__ for tweet in tweets])
+            df = pd.DataFrame([
+                dict(
+                    user_id=tweet.user_id,
+                    tweet_id=tweet.id,
+                    text=tweet.tweet,
+                    parent_tweet_id=None,  # TODO: Nuke
+                    timestamp=dateutil.parser.parse(tweet.datetime),
+                    likes=tweet.likes_count,
+                    retweets=tweet.retweets_count,
+                    replies=tweet.replies_count
+                )
+                for tweet in tweets
+            ])
         else:  # no tweets returned, ensure schema
             df = pd.DataFrame(columns=[
                 'user_id',
@@ -149,29 +168,8 @@ def run(self):
                 'retweets',
                 'replies'])
 
-        # Filter out false positive matches. This is obviously a workaround,
-        # but at the moment cheaper than repairing or switching the scraper.
-        # See #352.
-        is_false_positive = ~(
-            df['parent_tweet_id'].apply(bool)
-            | df['text'].str.contains(self.query, flags=re.IGNORECASE)
-            | df['screen_name'].str.contains(self.query, flags=re.IGNORECASE))
-        if is_false_positive.any():
-            false_positives = df[is_false_positive]
-            logger.warning(
-                f"Dropping {len(false_positives)} tweets that are not "
-                f"related to the query"
-            )
-            df = df[~is_false_positive]
-
         df = df.drop_duplicates(subset=['tweet_id'])
 
-        # timestamp is utc by default
-        df['timestamp'] = df['timestamp'].apply(
-            lambda utc_dt:
-                utc.localize(utc_dt, is_dst=None).astimezone(
-                    tzlocal.get_localzone()))
-
         with self.output().open('w') as output_file:
             df.to_csv(output_file, index=False, header=True)

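Taken together, the run() changes swap twitterscraper's query_tweets() call for a twint search that collects twint.tweet.tweet objects into a local list via Store_object_tweets_list and then projects each object onto the task's CSV schema. The two deleted blocks follow from the switch: the false-positive filter (see #352) goes away, presumably because twint matches the query server-side, and the UTC-localization step goes away because tweet.datetime is now parsed with dateutil, which handles the timestamp strings twint produces. One caveat: the commit writes "import dateutil" but calls dateutil.parser.parse(); this works only because pandas imports the dateutil.parser submodule as a side effect, so standalone code should import it explicitly. The following condensed sketch runs the new fetch path outside Luigi; the query and timespan values are examples, since the real task reads them from Luigi parameters, and the parent_tweet_id placeholder column (flagged # TODO: Nuke above) is left out:

import datetime as dt

import dateutil.parser  # explicit submodule import; see the caveat above
import pandas as pd
import twint

query = "museumbarberini"         # example; a luigi.Parameter in the task
timespan = dt.timedelta(days=60)  # example; a TimeDeltaParameter in the task

tweets = []
twint.run.Search(twint.Config(
    Search=query,
    Since=str(dt.date.today() - timespan),
    Until=str(dt.date.today() + dt.timedelta(days=1)),
    Limit=10000,
    Store_object=True,
    Store_object_tweets_list=tweets,
    Hide_output=True,
))

# Passing columns= keeps the schema stable even when no tweets come back,
# mirroring the task's empty-DataFrame fallback branch.
columns = ['user_id', 'tweet_id', 'text', 'timestamp',
           'likes', 'retweets', 'replies']
df = pd.DataFrame([
    dict(
        user_id=tweet.user_id,
        tweet_id=tweet.id,
        text=tweet.tweet,
        timestamp=dateutil.parser.parse(tweet.datetime),
        likes=tweet.likes_count,
        retweets=tweet.retweets_count,
        replies=tweet.replies_count,
    )
    for tweet in tweets
], columns=columns).drop_duplicates(subset=['tweet_id'])
print(df.head())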