Skip to content

Commit

Permalink
Fixed error reporting in spider
Browse files (browse the repository at this point in the history)
  • Loading branch information
minimalparts committed Feb 14, 2024
1 parent ab72cc3 commit b593b3e
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions app/indexer/spider.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,9 +18,13 @@ def omd_parse(current_url):
fout = open(join(dir_path,'docs_to_index.txt'),'a')
try:
xml = requests.get(current_url, timeout=10, headers={'Authorization': AUTH_TOKEN}, stream =True).raw
+ except:
+ print(">> ERROR: SPIDER: OMD PARSE: Request failed. Moving on.")
+ return links
+ try:
parse = xmltodict.parse(xml.read())
except:
- print("Request failed. Moving on.")
+ print(">> ERROR: SPIDER: OMD PARSE: File may have some bad XML. Could not parse.")
return links
docs = parse['omd_index']['doc']
print("PARSE:",parse)
Expand Down Expand Up @@ -79,7 +83,7 @@ def omd_parse(current_url):
print("# DOC BODY:", body_str[:100])
fout.write("{{BODY}} "+body_str+"\n")
else:
- print("# DOC BODY: Skipping request: content is neither text/plain nor text/html.")
+ print(">> ERROR: SPIDER: OMD PARSE: DOC BODY: Skipping request: content is neither text/plain nor text/html.")

fout.write("</doc>\n")
fout.close()
Expand Down Expand Up @@ -117,6 +121,6 @@ def write_docs(base_url):
#print("Found href:",link)
pages_to_visit.append(link)
except:
- print(">> ERROR: Failed visiting current url!")
+ print(">> ERROR: SPIDER: OMD PARSE: Failed visiting current url!")


0 comments on commit b593b3e

Please sign in to comment.