From 4bab5960c92610d40124ce1d388394becdbde486 Mon Sep 17 00:00:00 2001 From: chriskamphuis Date: Mon, 24 Jun 2019 21:04:00 +0200 Subject: [PATCH] Better topic reader --- src/main/python/topic_reader.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/main/python/topic_reader.py b/src/main/python/topic_reader.py index 9e6536e..b4871a8 100755 --- a/src/main/python/topic_reader.py +++ b/src/main/python/topic_reader.py @@ -9,7 +9,9 @@ def __init__(self, topics_file_name): self.filename = topics_file_name self.file = open(self.filename) self.topics = [] + print("Read topics...",flush=True) self._read_topics_file() + print("Preprocess topics...",flush=True) self._preprocess_titles() def _read_topics_file(self): @@ -17,36 +19,54 @@ def _read_topics_file(self): line = self.file.readline() if not line: break + + if not line.strip(): + continue - while line.strip() != '': + while line and not line.startswith(''): line = self.file.readline() + if not line: + break while not line.startswith(''): line = self.file.readline() - topic_no = int(re.search('\d+$', line.strip()).group(0)) + topic_no = int(re.search('Number: (\d+)', line.strip()).group(1)) + # print("Parsing topic {}".format(topic_no),flush=True) + while not line.startswith(''): line = self.file.readline() + + # Robust04 specific: topic_title = line.strip()[8:] - + line = self.file.readline().strip() - while not line.startswith('<desc>'): + while not line.startswith('') and not line.startswith(''): topic_title += line line = self.file.readline().strip() + while not line.startswith(''): + line = self.file.readline().strip() + topic_desc = "" line = self.file.readline().strip() - while not line.startswith(''): + while not line.startswith('') and not line.startswith(''): topic_desc += line line = self.file.readline().strip() + + while not line.startswith(''): + line = self.file.readline().strip() topic_nar = "" line = self.file.readline().strip() - while line != '': + while not line.startswith('') and not line.startswith(''): topic_nar += line line = self.file.readline().strip() + while not line.startswith(''): + line = self.file.readline().strip() + topic = { 'number' : topic_no, 'title' : topic_title,