it seems that the structure of the file looks something like
- font class="dayHeader" t
- font class="sessionTime" t
- font class="sessionLocation" t
- font class="sessionSource" t
- div class="sessionTitle" t
- div class="presiding"
  - font class="name" t
  - font class="affiliation" t
- div class="paper"
  - font class="paperTitle" t
  - div class="author"
    - font class="name" t
    - font class="affiliation" t
  - div id=paper_NNNNN_abstract ttt
L1: find first dayHeader
L3: get next sibling
    is dayHeader? yes, go to L1
    is not sessionTime? error E1
L2: collect sessionLocation, sessionSource, sessionTitle, presiding
    get next sibling
    is dayHeader? yes, go to L1
    is sessionTime? yes, go to L2
    is not paper? error E2
    collect paperTitle
    collect authors
    collect abstract
    go to L3
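a minimal sketch of that flat sibling walk, assuming BeautifulSoup 4 (the handler branches are just placeholders; the real, table-driven dispatch is in walk() further down):

from bs4 import NavigableString

def walk_siblings(node):
    # walk the flat run of siblings, dispatching on each element's class
    while node is not None:
        if not isinstance(node, NavigableString):
            cls = (node.get("class") or [""])[0]
            if cls == "dayHeader":
                pass      # start a new day
            elif cls == "sessionTime":
                pass      # start a new time slot
            elif cls == "paper":
                pass      # collect title, authors, abstract
            # ... sessionTitle, presiding, etc.
        node = node.next_sibling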
for layout:
- dayHeader: **
- sessionTime: ***
- session (Time, Location, Source, Title, presiding, panelist): **** so, duplicate sessionTime, to get info closer to reader
- each paper (Title, author): *****
- abstract (hyperlink): ******
when we get, e.g., a new sessionTime, check whether it is the same as the current one. if it is the same, don't emit anything; else, emit a new "** Session time: %"
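a tiny sketch of that emit-only-on-change rule (the names here are mine; the real version is in_day()/in_time() in the code below):

cur_time = ""

def emit_time_if_changed(new_time, out):
    # write a fresh heading only when the time slot actually changes
    global cur_time
    if new_time != cur_time:
        cur_time = new_time
        out.write("*** %s\n" % new_time)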
but, an alternative doc would have "Source" at the second level.
tag | section | premarker | postmarker | beforechild | afterchild |
---|---|---|---|---|---|
dayHeader | day | ** | \n | ||
sessionTime | time | *** | \n | ||
sessionTitle | session | **** | \n | ||
topic | session | ***** | \n | ||
sessionJEL | session | - JEL: | \n | ||
sessionSource | session | - | \n | ||
sessionType | session | \n | |||
sessionLocation | session | - | \n | ||
presiding | session | - presiding: | \n | ||
panelist | session | - panelist: | \n | ||
paper | paper | \n | |||
paperTitle | paper | ***** | \n | ||
author | paper | - author(s): | \n | ||
abstract | paper | - abstract:\n | \n | \n | |
nameaffil | - | \n | |||
affiliation | \n | ||||
discussantsSection | postsession | ***** | \n | ||
discussant | postsession | \n |
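for reference, runone() below takes this table as a list of rows in (tag, section, premarker, postmarker, beforechild, afterchild) order; a few rows as I read them (the \n cells are a literal backslash-n that output() later turns into a real newline):

orgsections = [
    # tag,           section,   premarker,       postmarker
    ["dayHeader",    "day",     "**",            "\\n", "", ""],
    ["sessionTime",  "time",    "***",           "\\n", "", ""],
    ["sessionTitle", "session", "****",          "\\n", "", ""],
    ["paperTitle",   "paper",   "*****",         "\\n", "", ""],
    ["author",       "paper",   "- author(s):",  "\\n", "", ""],
    # ... and so on for the remaining rows
]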
problem with HTML: need to surround children with, e.g., <ul>…</ul>
tag | section | premarker | postmarker | beforechild | afterchild |
---|---|---|---|---|---|
dayHeader | day | <li><input type="checkbox" id="dayHeader%{d}"><label for="dayHeader%{d}"> | </label>\n | <ul> | </ul></li> |
sessionTime | time | <li><input type="checkbox" id="sessionTime%{d}"><label for="sessionTime%{d}"> | </label>\n | <ul> | </ul></li> |
sessionTitle | session | <li><input type="checkbox" id="sessionTitle%{d}"><label for="sessionTitle%{d}"> | </label>\n | <ul> | </ul></li> |
topic | session | <li> | </li> | ||
sessionJEL | session | <li> JEL: | </li> | ||
sessionSource | session | <li> | </li> | ||
sessionType | session | <li> | </li> | ||
sessionLocation | session | <li> | </li>\n | ||
presiding | session | <li> presiding: | </li>\n | ||
panelist | session | <li> panelist: | </li>\n | ||
paper | paper | ||||
paperTitle | paper | <li><input type="checkbox" id="paper%{d}"><label for="paper%{d}"> | </label>\n | <ul> | </ul></li> |
author | paper | <li> | </li>\n | ||
abstract | paper | <li>abstract | \n | <ul><li> | </li></ul></li> |
nameaffil | <li> | </li>\n | |||
affiliation | <li> | </li>\n | |||
discussantsSection | postsession | <li> | </li>\n | <ul> | </ul> |
discussant | postsession |
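the HTML table has the same row shape, with beforechild/afterchild supplying the <ul>…</ul> wrapper around the children and %{d} replaced by a running counter in output(); two rows as I read them:

htmlsections = [
    ["dayHeader", "day",
     '<li><input type="checkbox" id="dayHeader%{d}"><label for="dayHeader%{d}">',
     '</label>\\n', '<ul>', '</ul></li>'],
    ["sessionLocation", "session", '<li>', '</li>\\n', '', ''],
    # ... remaining rows follow the second marker table
]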
import re
import textwrap
def semantic_init():
    global cur_day, cur_time, session, papers, postsession, abstractre
    cur_day = ""
    cur_time = ""
    session = {}
    papers = []
    postsession = []
    abstractre = re.compile("paper_[0-9]*_abstract")
def begin_day():
    end_parent('dayHeader')
def in_day(cur):
    global cur_day
    if cur_day != rstring(cur):
        cur_day = rstring(cur)
        withmarkers('dayHeader', cur_day)
def end_day():
    begin_parent('dayHeader')
def begin_time():
    pass
def in_time(cur):
    global cur_time
    if cur_time != rstring(cur):
        cur_time = rstring(cur)
        end_parent('sessionTime')
        withmarkers('sessionTime', cur_time)
        begin_parent('sessionTime')
def end_time():
    pass
# session.format =
# "**** $sessionTitle
# - $sessionJEL
# - $sessionSource
# - $sessionType
# - $sessionLocation
# - presiding: $presiding
# [- $panelist]*\n"
def begin_session():
    global session, papers
    session = {}
    papers = []  # new session, new set of papers
    end_parent('sessionTitle')
def in_session(cur):
    session[rclass(cur)] = cur
def end_session():
    if 'sessionTitle' in session:
        withmarkers('sessionTitle', session['sessionTitle'].contents[0].strip())
    else:
        withmarkers('sessionTitle', "<unnamed session>")
    begin_parent('sessionTitle')
    # JEL codes hang off sessionTitle's first child, when there is one
    if 'sessionTitle' in session and firstchild(session['sessionTitle']):
        withmarkers('sessionJEL', firstchild(session['sessionTitle']).string.strip())
    withmarkers('sessionSource', session['sessionSource'].string.strip())
    if 'sessionType' in session:
        withmarkers('sessionType', session['sessionType'].string.strip())
    withmarkers('sessionLocation', re.sub("^,", "", session['sessionLocation'].string.strip()))
    if 'presiding' in session:
        withmarkers('presiding', nameaffils(session['presiding']))
    if 'panelist' in session:
        withmarkers('panelist', nameaffils(session['panelist']))
# paper.format =
# "***** $paperTitle
# - $name ($affiliation)
# ****** abstract
# $abstract
def begin_paper():
    pass
def in_paper(cur):
    global papers
    papers += [cur]
def end_paper():
    for paper in papers:
        title = paper.find(class_="paperTitle").string.strip()
        withmarkers('paperTitle', title)
        begin_parent('paperTitle')
        withmarkers('nameaffil', nameaffils(paper))  # authors
        if paper.find(id=abstractre):
            output(premarker('abstract'))
            output(postmarker('abstract'))
            begin_parent('abstract')
            abstract = textwrap.fill(paper.find(id=abstractre).string.replace("\n", "").strip())
            output(abstract)
            end_parent('abstract')
        end_parent('paperTitle')
def begin_postsession():
    global postsession
    postsession = []
def in_postsession(cur):
    global postsession
    postsession = postsession + [cur]
def end_postsession():
    global postsession
    if "postsession" in globals():
        begin_parent('discussantsSection')
        for dsection in postsession:
            withmarkers('discussantsSection',
                        "discussant: %s" % nameaffils(dsection))  # discussants
        end_parent('discussantsSection')
# some semantic-aware utility routines
def nameaffils(curl, separator=", "):  # XXX descend to get names and affiliations
    result = ""
    cursep = ""
    # to allow ResultSet to work, make *everything* a list
    if type(curl).__name__ != 'ResultSet':
        curl = [curl]
    for cur in curl:
        for name, affil in zip(cur.findAll(class_="name"),
                               cur.findAll(class_="affiliation")):
            result = result + cursep + name.string.strip() + " " + affil.string.strip()
            cursep = separator
    return result
def premarker(sect):
    return sections[sect][s_premarker]
def postmarker(sect):
    return sections[sect][s_postmarker]
def withmarkers(sect, str):
    output("%s %s %s" % (premarker(sect), str, postmarker(sect)))
# paradoxically, we call begin at end, end at begin...
def end_parent(tag):
    global parents
    if tag in parents:
        tail = parents.pop()
        while tail != tag:  # close any children still open above this tag
            output(sections[tail][s_afterchild])
            tail = parents.pop()
        output(sections[tag][s_afterchild])
def begin_parent(tag):
    global parents
    output(sections[tag][s_beforechild])
    parents.append(tag)
# this is the non-semantic part of our process
def output(outstr):
    global outf, outcount
    outstr = outstr.replace("%{d}", str(outcount))
    outstr = outstr.replace("\\n", "\n")
    outcount += 1
    outf.write(outstr.encode("utf-8"))
def navigablestring(cur):
    return type(cur).__name__ == "NavigableString"
def rstring(cur):
    try:
        if navigablestring(cur.contents[0]) and (len(cur.contents) == 1):
            return cur.string.strip()
        else:
            return ""
    # http://stackoverflow.com/a/730778
    except Exception:
        return ""
def rclass(cur):
    try:
        return cur['class'][0]
    except Exception:
        return ""
def nextsib(cur, count=1):
    x = cur.next_sibling
    while type(x).__name__ == "NavigableString":
        x = x.next_sibling
    if count <= 1:
        return x
    else:
        return nextsib(x, count-1)
def firstchild(cur):
    try:
        child = cur.contents[0]
        if type(child).__name__ == "NavigableString":
            return nextsib(child)
        else:
            return child
    except Exception:
        pass
def listtodict(l):
    a = {}
    for i in l:
        a[i[0]] = i[1:]
    return a
def walk(me, outfile, reset=True):
    global lastsection, section, lastme, outf
    if reset:
        lastsection = ""
        semantic_init()
        outf = open(outfile, "w")
        # https://docs.python.org/2/howto/unicode.html
    while me:
        lastme = me
        # print "%s: %s" % (rclass(me), rstring(me))
        class_ = rclass(me)
        if class_ != '':
            section = sections[class_][s_section]
            if section == "":
                section = lastsection
            if section != lastsection:  # changing section
                if lastsection != "":
                    eval("end_%s()" % lastsection)  # end the previous section
                lastsection = section
                eval("begin_%s()" % section)  # start the new section
            eval("in_%s(me)" % section)
            # print "%s: %s" % (class_, rstring(me))
        me = nextsib(me)  # continue this level
def walkdown(parents, outfile):
    first = True
    for one in parents:
        walk(firstchild(one), outfile, reset=first)
        first = False
def runone(sects, outfile):
    global sections, outcount, parents
    sections = listtodict(sects)
    sections[''] = ['']
    outcount = 0
    # http://stackoverflow.com/a/4688885
    # https://docs.python.org/2/tutorial/datastructures.html
    parents = []
    walkdown(soup.findAll(id=re.compile("group_div.*")), outfile)
def soupson(fname):
    global soup
    from bs4 import BeautifulSoup
    # http://stackoverflow.com/questions/11339955/python-string-encode-decode
    html = open(fname, "r").read()
    # need to get rid of <hr>, <br> (mess up beautifulsoup)
    # http://stackoverflow.com/questions/17639031/beautifulsoup-sibling-structure-with-br-tags
    # and, <strong>, <em>, seem to get in our way (by making cur.string = "",
    # needing to descend)
    # XXX -- should be some more general way of doing this!
    p = re.compile(r"<br>|<br />|<hr>|<hr />|<strong>|</strong>|<em>|</em>|<input type='hidden' value='[0-9]*' name='div_contents\[\]' />")
    html = p.sub("", html)
    for i in ["<br>", "<br />", "<hr>", "<hr />", "<strong>", "</strong>", "<em>", "</em>"]:
        html = html.replace(i, "")
    # from
    # http://www.crummy.com/software/BeautifulSoup/bs4/doc/
    soup = BeautifulSoup(html, 'html.parser', from_encoding="utf-8")
s_section = 0
s_premarker = 1
s_postmarker = 2
s_beforechild = 3
s_afterchild = 4
soupson(fname)
runone(htmlsections, htmloutfile)
runone(orgsections, orgoutfile)
# walk(soup.find(class_="dayHeader")) # *old* style
# walkdown(soup.findAll(id=re.compile("group_div.*")))
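note that fname, htmlsections, orgsections, htmloutfile, and orgoutfile are not defined in this block; presumably they are bound elsewhere (earlier in the file, or passed in babel-style). the kind of bindings assumed, with purely illustrative values:

fname = "program.html"          # illustrative: the saved conference-program page
orgoutfile = "program.org"      # illustrative output names
htmloutfile = "program-outline.html"
# orgsections / htmlsections: lists of rows as sketched after the two marker tables above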
from bs4 import BeautifulSoup
# http://stackoverflow.com/questions/11339955/python-string-encode-decode
html = open(fname, "r").read()
# need to get rid of <hr>, <br> (mess up beautifulsoup)
# http://stackoverflow.com/questions/17639031/beautifulsoup-sibling-structure-with-br-tags
# and, <strong>, <em>, seem to get in our way (by making cur.string =
# "", needing to descend
# XXX -- should be some more general way of doing this!
for i in ["<br>", "<hr>", "<strong>", "</strong>", "<em>", "</em>"]:
html = html.replace(i, "")
# from
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/
soup = BeautifulSoup(html, 'html.parser', from_encoding="utf-8")
print("done")
sessiontimes = sorted(set(soup.find_all('font', "sessionTime")))
# http://stackoverflow.com/questions/19460403/html-file-parsing-in-python
from bs4 import BeautifulSoup
from pprint import pprint
soup = BeautifulSoup(html)
h2s = soup.select("h2")        # get all h2 elements
tables = soup.select("table")  # get all tables
first = True
title = ""
players = []
for i, table in enumerate(tables):
    if first:
        # every h2 element has 2 tables. table size = 8, h2 size = 4
        # so for every 2 tables 1 h2
        title = h2s[int(i/2)].text
    for tr in table.select("tr"):
        player = (title,)  # create a player
        for td in tr.select("td"):
            player = player + (td.text,)  # add td info in the player
        if len(player) > 1:
            # if the tr contains a player and it's not only ("Goalkeeper"), add it
            players.append(player)
    first = not first
pprint(players)
# https://bytes.com/topic/python/answers/684389-removing-certain-tags-html-files
from BeautifulSoup import BeautifulSoup
def remove(soup, tagname):
    for tag in soup.findAll(tagname):
        contents = tag.contents
        parent = tag.parent
        tag.extract()
        for tag in contents:
            parent.append(tag)
def main():
    source = '<a><b>This is a <c>Test</c></b></a>'
    soup = BeautifulSoup(source)
    print soup
    remove(soup, 'b')
    print soup
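for what it's worth, bs4 has this built in: unwrap() drops a tag but keeps its children. a quick sketch:

from bs4 import BeautifulSoup
soup = BeautifulSoup('<a><b>This is a <c>Test</c></b></a>', 'html.parser')
for tag in soup.find_all('b'):
    tag.unwrap()   # remove <b>...</b> but keep what was inside it
print(soup)        # <a>This is a <c>Test</c></a>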