This repository has been archived by the owner on Jan 7, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathqueue-stories-from-db.py
53 lines (43 loc) · 2.16 KB
/
queue-stories-from-db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import logging
import sys
from quoteworker import get_db_client
from quoteworker.tasks import parse_quotes_to_db
logger = logging.getLogger(__name__)

# Upper bound on how many stories a single run will queue. Use a small value
# (~4) while testing, or anything larger than your total to queue them all.
BATCH_SIZE = 200000

# Long stories cause timeouts, so text longer than this many characters is
# split into pieces before queueing.
MAX_CHAR_LEN = 10000

# Name of the Mongo document property holding the text to check for quotes.
TEXT_PROP = 'story_text'

collection = get_db_client()

# Progress report: how many documents have text at all, and how many of those
# have not been annotated with quotes yet.
total = collection.count_documents({TEXT_PROP: {'$exists': True}})
unprocessed = collection.count_documents({'annotatedWithQuotes': {'$exists': False}, TEXT_PROP: {'$exists': True}})

logger.info("Stats:")
for stat_template, stat_value in (
    (" {} total", total),
    (" {} have quotes", total - unprocessed),
    (" {} need quotes", unprocessed),
):
    logger.info(stat_template.format(stat_value))

# Debugging aid: uncomment to stop right after the stats are logged.
# sys.exit()
# Fetch up to BATCH_SIZE stories that have text but no quote annotation yet,
# and queue a parsing job for each one (several jobs for long stories).
logger.info("Fetching...")
queued = 0
chunk_count = 0
pending_filter = {'annotatedWithQuotes': {'$exists': False}, TEXT_PROP: {'$exists': True}}
results = collection.find(pending_filter).limit(BATCH_SIZE)
for story in results:
    story_id = story['stories_id']
    logger.debug("Story {}".format(story_id))

    # the property can exist and still be null, so guard against that here
    if (TEXT_PROP not in story) or (story[TEXT_PROP] is None):
        logger.warning(" Story {} has no text - skipping".format(story_id))
        continue

    text = story[TEXT_PROP]
    # break longer stories into MAX_CHAR_LEN-sized pieces so we don't hit a
    # CoreNLP timeout; shorter stories go through as a single piece
    chunks = (
        [text[start:start + MAX_CHAR_LEN] for start in range(0, len(text), MAX_CHAR_LEN)]
        if len(text) > MAX_CHAR_LEN
        else [text]
    )

    # one job per piece; each job saves the extracted quote text to the DB
    for piece in chunks:
        parse_quotes_to_db.delay({'stories_id': story_id, 'text': piece, 'add_to_quotes': True})

    chunk_count += len(chunks)
    queued += 1

logger.info("Queued {} stories (in {} < {} char chunks)".format(queued, chunk_count, MAX_CHAR_LEN))