"""Provides tasks for downloading tweets related to the museum."""

import datetime as dt
+import dateutil.parser
from pytz import utc
import re

import luigi
from luigi.format import UTF8
import pandas as pd
-import twitterscraper as ts
+import twint
import tzlocal

from _utils import CsvToDb, DataPreparationTask, MuseumFacts, logger
@@ -114,7 +115,7 @@ def output(self):


class FetchTwitter(DataPreparationTask):
-    """Fetch tweets related to the museum using the twitterscraper."""
+    """Fetch tweets related to the museum using twint."""

    query = luigi.Parameter(default="museumbarberini")
    timespan = luigi.parameter.TimeDeltaParameter(
@@ -132,12 +133,30 @@ def run(self):
        if self.minimal_mode:
            timespan = dt.timedelta(days=5)

-        tweets = ts.query_tweets(
-            self.query,
-            begindate=dt.date.today() - timespan,
-            enddate=dt.date.today() + dt.timedelta(days=1))
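+        # Query recent tweets via twint: Since/Until bound the search
+        # window, Limit caps the number of results, and Store_object makes
+        # twint collect each hit into the given list instead of printing it.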
+        tweets: list = []  # will hold twint.tweet.tweet objects
+        twint.run.Search(twint.Config(
+            Search=self.query,
+            Since=str(dt.date.today() - timespan),
+            Until=str(dt.date.today() + dt.timedelta(days=1)),
+            Limit=10000,
+            Store_object=True,
+            Store_object_tweets_list=tweets,
+            Hide_output=True
+        ))
        if tweets:
-            df = pd.DataFrame([tweet.__dict__ for tweet in tweets])
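+            # twint's tweet objects expose different attribute names than
+            # twitterscraper's did, so map them explicitly to keep the
+            # downstream CSV schema stable.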
+            df = pd.DataFrame([
+                dict(
+                    user_id=tweet.user_id,
+                    tweet_id=tweet.id,
+                    text=tweet.tweet,
+                    parent_tweet_id=None,  # TODO: Nuke
+                    timestamp=dateutil.parser.parse(tweet.datetime),
+                    likes=tweet.likes_count,
+                    retweets=tweet.retweets_count,
+                    replies=tweet.replies_count
+                )
+                for tweet in tweets
+            ])
        else:  # no tweets returned, ensure schema
            df = pd.DataFrame(columns=[
                'user_id',
@@ -149,29 +168,8 @@ def run(self):
                'retweets',
                'replies'])

-        # Filter out false positive matches. This is oviously a workaround,
-        # but at the moment cheaper than repairing or switching the scraper.
-        # See #352.
-        is_false_positive = ~(
-            df['parent_tweet_id'].apply(bool)
-            | df['text'].str.contains(self.query, flags=re.IGNORECASE)
-            | df['screen_name'].str.contains(self.query, flags=re.IGNORECASE))
-        if is_false_positive.any():
-            false_positives = df[is_false_positive]
-            logger.warning(
-                f"Dropping {len(false_positives)} tweets that are not "
-                f"related to the query"
-            )
-            df = df[~is_false_positive]
-
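+        # Deduplicate by tweet id, as the scraper may return the same tweet
+        # more than once.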
        df = df.drop_duplicates(subset=['tweet_id'])

-        # timestamp is utc by default
-        df['timestamp'] = df['timestamp'].apply(
-            lambda utc_dt:
-                utc.localize(utc_dt, is_dst=None).astimezone(
-                    tzlocal.get_localzone()))
-
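+        # Note: the UTC-to-local conversion above is obsolete now that twint
+        # reports tweet.datetime in the local timezone.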
        with self.output().open('w') as output_file:
            df.to_csv(output_file, index=False, header=True)