Skip to content

Commit

Permalink
Better topic reader
Browse files (browse the repository at this point in the history)
  • Loading branch information
chriskamphuis committed Jun 24, 2019
1 parent 7b569fd commit 4bab596
Showing 1 changed file with 26 additions and 6 deletions.
32 changes: 26 additions & 6 deletions src/main/python/topic_reader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,44 +9,64 @@ def __init__(self, topics_file_name):
self.filename = topics_file_name
self.file = open(self.filename)
self.topics = []
print("Read topics...",flush=True)
self._read_topics_file()
print("Preprocess topics...",flush=True)
self._preprocess_titles()

def _read_topics_file(self):
while True:
line = self.file.readline()
if not line:
break

if not line.strip():
continue

while line.strip() != '<top>':
while line and not line.startswith('<top>'):
line = self.file.readline()
if not line:
break

while not line.startswith('<num>'):
line = self.file.readline()
topic_no = int(re.search('\d+$', line.strip()).group(0))
topic_no = int(re.search('Number: (\d+)', line.strip()).group(1))

# print("Parsing topic {}".format(topic_no),flush=True)

while not line.startswith('<title>'):
line = self.file.readline()

# Robust04 specific:
topic_title = line.strip()[8:]

line = self.file.readline().strip()
while not line.startswith('<desc>'):
while not line.startswith('</title>') and not line.startswith('<desc>'):
topic_title += line
line = self.file.readline().strip()

while not line.startswith('<desc>'):
line = self.file.readline().strip()

topic_desc = ""
line = self.file.readline().strip()
while not line.startswith('<narr>'):
while not line.startswith('</desc>') and not line.startswith('<narr>'):
topic_desc += line
line = self.file.readline().strip()

while not line.startswith('<narr>'):
line = self.file.readline().strip()

topic_nar = ""
line = self.file.readline().strip()

while line != '</top>':
while not line.startswith('</narr>') and not line.startswith('</top>'):
topic_nar += line
line = self.file.readline().strip()

while not line.startswith('</top>'):
line = self.file.readline().strip()

topic = {
'number' : topic_no,
'title' : topic_title,
Expand Down

0 comments on commit 4bab596

Please sign in to comment.