Skip to content

Latest commit

 

History

History
479 lines (405 loc) · 18.2 KB

aea.org

File metadata and controls

479 lines (405 loc) · 18.2 KB

BeautifulSoup

It seems that the file is structured something like this:

  • font class="dayHeader" t
  • font class="sessionTime" t
  • font class="sessionLocation" t
  • font class="sessionSource" t
  • div class="sessionTitle" t
  • div class="presiding"
    • font class="name" t
    • font class="affiliation" t
  • div class="paper"
    • font class="paperTitle" t
    • div class="author"
      • font class="name" t
      • font class="affiliation" t
    • div id="paper_NNNNN_abstract" ttt

L1: find first dayHeader
L3: get next sibling
    is dayHeader? yes, go to L1
    is not sessionTime? error E1
L2: collect sessionLocation, sessionSource, sessionTitle, presiding
    get next sibling
    is dayHeader? yes, go to L1
    is sessionTime? yes, go to L2
    is not paper? error E2
    collect paperTitle
    collect authors
    collect abstract
    go to L3

for layout:

  • dayHeader: **
  • sessionTime: ***
  • session (Time, Location, Source, Title, presiding, panelist): **** (so, duplicate sessionTime, to get the info closer to the reader)
  • each paper (Title, author): *****
  • abstract (hyperlink): ******

When we get, e.g., a new sessionTime, check whether it is the same as the current one. If it is, don’t emit anything; otherwise, emit a new “** Session time: %”.

But an alternative document would have “Source” at the second level.

tag | section | premarker | postmarker | beforechild | afterchild
dayHeaderday**\n
sessionTimetime***\n
sessionTitlesession****\n
topicsession*****\n
sessionJELsession- JEL:\n
sessionSourcesession-\n
sessionTypesession\n
sessionLocationsession-\n
presidingsession- presiding:\n
panelistsession- panelist:\n
paperpaper\n
paperTitlepaper*****\n
authorpaper- author(s):\n
abstractpaper- abstract:\n\n\n
nameaffil-\n
affiliation\n
discussantsSectionpostsession*****\n
discussantpostsession\n

problem with HTML: need to surround children with, e.g., <ul>…</ul>

tag | section | premarker | postmarker | beforechild | afterchild
dayHeaderday<li><input type="checkbox" id="dayHeader%{d}"><label for="dayHeader%{d}"></label>\n<ul></ul></li>
sessionTimetime<li><input type="checkbox" id="sessionTime%{d}"><label for="sessionTime%{d}"></label>\n<ul></ul></li>
sessionTitlesession<li><input type="checkbox" id="sessionTitle%{d}"><label for="sessionTitle%{d}"></label>\n<ul></ul></li>
topicsession<li></li>
sessionJELsession<li> JEL:</li>
sessionSourcesession<li></li>
sessionTypesession<li></li>
sessionLocationsession<li></li>\n
presidingsession<li> presiding:</li>\n
panelistsession<li> panelist:</li>\n
paperpaper
paperTitlepaper<li><input type="checkbox" id="paper%{d}"><label for="paper%{d}"></label>\n<ul></ul></li>
authorpaper<li></li>\n
abstractpaper<li>abstract\n<ul><li></li></ul></li>
nameaffil<li></li>\n
affiliation<li></li>\n
discussantsSectionpostsession<li></li>\n<ul></ul>
discussantpostsession
import re
import textwrap

def semantic_init():
    """Reset the module-level parsing state.

    Blanks the current day and time strings, empties the per-session dict
    and the pending paper / post-session lists, and compiles the pattern
    used to find abstract elements by their id.
    """
    global cur_day, cur_time, session, papers, postsession, abstractre
    cur_day = cur_time = ""
    session, papers, postsession = {}, [], []
    abstractre = re.compile("paper_[0-9]*_abstract")

def begin_day():
    """Hook for entering a 'day' section: close any open 'dayHeader' parent."""
    tag = 'dayHeader'
    end_parent(tag)

def in_day(cur):
    """Emit a day header for *cur* unless it repeats the current day.

    The text of *cur* (via rstring) is compared with the remembered
    ``cur_day``; only a different value is stored and written out.
    """
    global cur_day
    day = rstring(cur)
    if day == cur_day:
        return
    cur_day = day
    withmarkers('dayHeader', day)

def end_day():
    """Hook for leaving a 'day' section: open 'dayHeader' as a parent."""
    tag = 'dayHeader'
    begin_parent(tag)

def begin_time():
    """Hook for entering a 'time' section; intentionally does nothing."""
    return None

def in_time(cur):
    """Emit a session-time header for *cur* unless it repeats the current one.

    On a new time value the previous 'sessionTime' parent is closed, the
    header is written, and 'sessionTime' is opened again as a parent.
    """
    global cur_time
    when = rstring(cur)
    if when == cur_time:
        return
    cur_time = when
    end_parent('sessionTime')
    withmarkers('sessionTime', when)
    begin_parent('sessionTime')

def end_time():
    """Hook for leaving a 'time' section; intentionally does nothing."""
    return None

# session.format = 
# "**** $sessionTitle
# - $sessionJEL
# - $sessionSource
# - $sessionType
# - $sessionLocation
# - presiding: $presiding
# [- $panelist]*\n"

def begin_session():
    """Hook for entering a 'session' section.

    Starts with an empty session dict and an empty paper list (a new
    session means a new set of papers), then closes any open
    'sessionTitle' parent.
    """
    global session, papers
    session, papers = {}, []
    end_parent('sessionTitle')

def in_session(cur):
    """Remember *cur* in the session dict, keyed by its first CSS class."""
    key = rclass(cur)
    session[key] = cur

def end_session():
    """Write out the session collected by in_session().

    Emits the session title (or a placeholder when none was collected),
    opens 'sessionTitle' as a parent, and then emits whichever of the JEL
    code, source, type, location, presiding and panelist entries were
    collected.  Entries that are absent from the session dict are skipped
    rather than raising KeyError.
    """
    title_tag = session.get('sessionTitle')
    if title_tag is not None:
        withmarkers('sessionTitle', title_tag.contents[0].strip())
    else:
        withmarkers('sessionTitle', "<unnamed session>")
    begin_parent('sessionTitle')
    if title_tag is not None:
        # The JEL entry is read from the first element nested in the title.
        jel = firstchild(title_tag)
        if jel:
            withmarkers('sessionJEL', jel.string.strip())
    if 'sessionSource' in session:
        withmarkers('sessionSource', session['sessionSource'].string.strip())
    if 'sessionType' in session:
        withmarkers('sessionType', session['sessionType'].string.strip())
    if 'sessionLocation' in session:
        # Drop a leading comma from the location text.
        location = session['sessionLocation'].string.strip()
        withmarkers('sessionLocation', re.sub("^,", "", location))
    if 'presiding' in session:
        withmarkers('presiding', nameaffils(session['presiding']))
    if 'panelist' in session:
        withmarkers('panelist', nameaffils(session['panelist']))

# paper.format =
# "***** $paperTitle
# - $name ($affiliation)
# ****** abstract
# $abstract

def begin_paper():
    """Hook for entering a 'paper' section; intentionally does nothing."""
    return None

def in_paper(cur):
    """Queue *cur* on the pending paper list (written later by end_paper)."""
    papers.append(cur)

def _tag_text(tag):
    """Return the text of *tag*, descending into nested markup if needed."""
    text = tag.string
    if text is None:
        # Beautiful Soup leaves .string as None when a tag holds more than
        # one child; get_text() concatenates the nested strings instead.
        text = tag.get_text()
    return text


def end_paper():
    """Write out every paper queued by in_paper().

    For each paper: emit its title, open 'paperTitle' as a parent, emit the
    authors, and -- when the paper has an element whose id matches the
    abstract pattern -- emit the abstract markers followed by the abstract
    text re-wrapped by textwrap.fill.  The 'paperTitle' parent is closed
    again after each paper.
    """
    for paper in papers:
        title = _tag_text(paper.find(class_="paperTitle")).strip()
        withmarkers('paperTitle', title)
        begin_parent('paperTitle')
        withmarkers('nameaffil', nameaffils(paper)) # authors
        abstract_tag = paper.find(id=abstractre)    # looked up once, reused
        if abstract_tag:
            output(premarker('abstract'))
            output(postmarker('abstract'))
            begin_parent('abstract')
            flat = _tag_text(abstract_tag).replace("\n", "").strip()
            output(textwrap.fill(flat))
            end_parent('abstract')
        end_parent('paperTitle')


def begin_postsession():
    """Hook for entering a 'postsession' section: start a fresh list."""
    global postsession
    postsession = list()

def in_postsession(cur):
    """Rebind the post-session list to a new list with *cur* added at the end."""
    global postsession
    extended = list(postsession)
    extended.append(cur)
    postsession = extended

def end_postsession():
    """Write out the discussant entries queued by in_postsession().

    Opens 'discussantsSection' as a parent, emits one "discussant: ..."
    line per queued element, then closes the parent again.  Nothing is
    written if the module has no ``postsession`` global at all.
    """
    global postsession
    if "postsession" not in globals():
        return
    begin_parent('discussantsSection')
    for dsection in postsession:
        people = nameaffils(dsection) # discussants
        withmarkers('discussantsSection', "discussant: %s" % people)
    end_parent('discussantsSection')

# some semantic-aware utility routines

def nameaffils(curl, separator=", "): # XXX descend to get names and affiliations
    """Return "name affiliation" pairs found under *curl* as one string.

    *curl* is either a single element or a ResultSet of elements; a single
    element is wrapped in a list so both cases share one loop.  Within each
    element the class="name" and class="affiliation" descendants are paired
    in order, each pair rendered as the stripped name, a space, and the
    stripped affiliation.  Pairs are joined with *separator*.
    """
    if type(curl).__name__ != 'ResultSet':
        curl = [curl]
    pieces = []
    for cur in curl:
        names = cur.findAll(class_="name")
        affils = cur.findAll(class_="affiliation")
        for name, affil in zip(names, affils):
            pieces.append(name.string.strip() + " " + affil.string.strip())
    return separator.join(pieces)


def premarker(sect):
    """Return the pre-marker column of the sections row for *sect*."""
    row = sections[sect]
    return row[s_premarker]

def postmarker(sect):
    """Return the post-marker column of the sections row for *sect*."""
    row = sections[sect]
    return row[s_postmarker]

def withmarkers(sect, str):
    """Write *str* wrapped in the pre- and post-markers of *sect*.

    The three parts are separated by single spaces.
    """
    lead = premarker(sect)
    trail = postmarker(sect)
    output("%s %s %s" % (lead, str, trail))

# paradoxically, we call begin at end, end at begin...
def end_parent(tag):
    """Close *tag* and every parent opened after it.

    Does nothing when *tag* is not on the parents stack.  Otherwise pops
    the stack down to and including *tag*, writing the after-child text of
    each popped entry (innermost first, *tag* last).
    """
    global parents
    if tag not in parents:
        return
    while True:
        closing = parents.pop()
        if closing == tag:
            break
        output(sections[closing][s_afterchild])
    output(sections[tag][s_afterchild])

def begin_parent(tag):
    """Write the before-child text of *tag* and push it on the parents stack."""
    global parents
    opener = sections[tag][s_beforechild]
    output(opener)
    parents.append(tag)


# this is the non-semantic part of our process

def output(outstr):
    """Write *outstr* to the current output file, UTF-8 encoded.

    Before writing, every "%{d}" is replaced by the current output counter
    and every literal backslash-n pair by a real newline.  The counter is
    incremented on every call, whether or not "%{d}" was present.
    """
    global outf, outcount
    rendered = outstr.replace("%{d}", str(outcount)).replace("\\n", "\n")
    outcount += 1
    outf.write(rendered.encode("utf-8"))

def navigablestring(cur):
    """Return True when the exact type of *cur* is named "NavigableString".

    The comparison is on the type's name, so subclasses with a different
    name do not count.
    """
    kind = type(cur)
    return kind.__name__ == "NavigableString"


def rstring(cur):
    """Return the stripped text of *cur*, or "" when it has no simple text.

    Text is returned only when *cur* has exactly one child and that child
    is a NavigableString.  Anything else -- including objects without
    ``contents`` or with no children -- yields "".
    """
    try:
        children = cur.contents
        # Logical `and` rather than bitwise `&`: both operands are booleans,
        # and `and` states the intent and short-circuits.
        if navigablestring(children[0]) and len(children) == 1:
            return cur.string.strip()
        return ""
    # http://stackoverflow.com/a/730778
    except Exception:
        # Best effort by design: any failure to read the text means "no text".
        return ""

def rclass(cur):
    """Return the first entry of ``cur['class']``, or "" if it cannot be read."""
    try:
        classes = cur['class']
        first = classes[0]
    except Exception:
        return ""
    return first

def nextsib(cur, count=1):
    """Return the *count*-th following sibling of *cur*, skipping strings.

    Siblings whose type is named "NavigableString" are stepped over and do
    not count.  A *count* of 1 or less means the next such sibling.
    Returns None when the siblings run out before *count* is reached
    (previously this raised AttributeError for count > 1), or when *cur*
    itself is None.
    """
    node = cur
    for _ in range(max(count, 1)):
        if node is None:
            return None
        node = node.next_sibling
        while type(node).__name__ == "NavigableString":
            node = node.next_sibling
    return node

def firstchild(cur):
    """Return the first non-string child of *cur*, or None.

    If the first child is a NavigableString, its next non-string sibling is
    returned instead.  Any failure (no ``contents``, no children, ...)
    results in None.
    """
    try:
        head = cur.contents[0]
        if type(head).__name__ != "NavigableString":
            return head
        return nextsib(head)
    except Exception:
        return None

def listtodict(l):
    """Turn a list of rows into a dict keyed by each row's first item.

    The value for a key is the rest of that row (``row[1:]``); when a key
    repeats, the last row wins.
    """
    return {row[0]: row[1:] for row in l}

def walk(me, outfile, reset=True):
    """Walk *me* and its following siblings, dispatching to section hooks.

    Each element's first CSS class is looked up in the sections table to
    find its section.  When the section changes, ``end_<previous>()`` and
    ``begin_<new>()`` are called; every classed element is then passed to
    ``in_<section>(element)``.  An element whose table entry has an empty
    section stays in the current section.

    With *reset* true the walk starts from scratch: the section state is
    cleared, semantic_init() is called and *outfile* is opened for writing.
    With *reset* false the state left by the previous call is continued.
    """
    global lastsection, section, lastme, outf
    if reset:
        lastsection = ""
        semantic_init()
        outf = open(outfile, "w")
        # https://docs.python.org/2/howto/unicode.html
    hooks = globals()   # begin_*/in_*/end_* are looked up by name, no eval
    while me:
        lastme = me
        class_ = rclass(me)
        if class_ != '':
            section = sections[class_][s_section]
            if section == "":
                section = lastsection
            if section != lastsection: # changing section
                if lastsection != "":
                    hooks["end_%s" % lastsection]() # end the previous section
                lastsection = section
                hooks["begin_%s" % section]()       # start the new section
            hooks["in_%s" % section](me)
        # Advance for every element.  This used to sit inside the `if`
        # above, so an element without a class was never stepped past and
        # the loop spun forever.
        me = nextsib(me)        # continue this level

def walkdown(parents, outfile):
    """Walk the children of each element in *parents* in turn.

    Only the walk of the first element resets the walker state (and so
    opens *outfile*); later ones continue where the previous left off.
    """
    for position, one in enumerate(parents):
        walk(firstchild(one), outfile, reset=(position == 0))

def runone(sects, outfile):
    """Render the parsed document to *outfile* using the table *sects*.

    *sects* is a list of rows whose first item is a tag name; it becomes
    the module-level ``sections`` lookup.  Every element whose id matches
    "group_div.*" is then walked.

    After the walk the run is finished off explicitly: the section that was
    still in progress gets its ``end_*`` hook (the walker only calls it
    when a following section starts, so the final papers or discussants
    were otherwise never written), any parents still open are closed, and
    the output file is closed.
    """
    global sections, outcount, parents
    sections = listtodict(sects)
    sections[''] = ['']
    outcount = 0
    # http://stackoverflow.com/a/4688885
    # https://docs.python.org/2/tutorial/datastructures.html
    parents = []
    walkdown(soup.findAll(id=re.compile("group_div.*")), outfile)

    state = globals()
    pending = state.get('lastsection', "")
    if pending:
        state["end_%s" % pending]()     # flush the section left in progress
        state['lastsection'] = ""
    while parents:                      # close whatever is still open
        output(sections[parents.pop()][s_afterchild])
    sink = state.get('outf')
    if sink is not None and not sink.closed:
        sink.close()

def soupson(fname):
    """Read the HTML file *fname* and parse it into the global ``soup``.

    Before parsing, <br>, <hr>, <strong> and <em> tags and the hidden
    ``div_contents[]`` inputs are removed from the raw text.
    """
    global soup
    from bs4 import BeautifulSoup
    # http://stackoverflow.com/questions/11339955/python-string-encode-decode
    with open(fname, "r") as src:   # closed as soon as the text is read
        html = src.read()
    # need to get rid of <hr>, <br> (mess up beautifulsoup)
    # http://stackoverflow.com/questions/17639031/beautifulsoup-sibling-structure-with-br-tags
    # and, <strong>, <em>, seem to get in our way (by making cur.string =
    # "", needing to descend
    # XXX -- should be some more general way of doing this!
    # Raw string so the regex escapes \[ and \] are not read as (invalid)
    # string escapes; the pattern itself is unchanged.
    p = re.compile(r"<br>|<br />|<hr>|<hr />|<strong>|</strong>|<em>|</em>|<input type='hidden' value='[0-9]*' name='div_contents\[\]' />")
    html = p.sub("", html)
    for i in ["<br>", "<br />", "<hr>", "<hr />", "<strong>", "</strong>", "<em>", "</em>"]:
        html = html.replace(i, "")
    # from
    # http://www.crummy.com/software/BeautifulSoup/bs4/doc/
    soup = BeautifulSoup(html, 'html.parser', from_encoding="utf-8")

# Positions of the columns within a sections[...] entry (the row with its
# leading tag name removed by listtodict).
(s_section,
 s_premarker,
 s_postmarker,
 s_beforechild,
 s_afterchild) = range(5)

# Parse the input once, then render it once per output format.
soupson(fname)

runone(htmlsections, htmloutfile)
runone(orgsections, orgoutfile)
# walk(soup.find(class_="dayHeader")) # *old* style
# walkdown(soup.findAll(id=re.compile("group_div.*")))

from bs4 import BeautifulSoup
# http://stackoverflow.com/questions/11339955/python-string-encode-decode
# Read the whole input; the with-block closes the file once it is read.
with open(fname, "r") as _html_file:
    html = _html_file.read()

# need to get rid of <hr>, <br> (mess up beautifulsoup)
# http://stackoverflow.com/questions/17639031/beautifulsoup-sibling-structure-with-br-tags

# and, <strong>, <em>, seem to get in our way (by making cur.string =
# "", needing to descend

# XXX -- should be some more general way of doing this!
for i in ["<br>", "<hr>", "<strong>", "</strong>", "<em>", "</em>"]:
    html = html.replace(i, "")

# from
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/
soup = BeautifulSoup(html, 'html.parser', from_encoding="utf-8")
print("done")

# The distinct sessionTime <font> tags, in a stable order.  list.sort() sorts
# in place and returns None, so `list(set(...)).sort()` always bound None
# here; sorted() returns the sorted list.  key=str orders tags by their markup.
sessiontimes = sorted(set(soup.find_all('font', "sessionTime")), key=str)
# http://stackoverflow.com/questions/19460403/html-file-parsing-in-python
from bs4 import BeautifulSoup
from pprint import pprint

soup = BeautifulSoup(html)
h2s = soup.select("h2")         # all h2 elements
tables = soup.select("table")   # all tables

first = True
title = ""
players = []
for i, table in enumerate(tables):
    if first:
        # Each h2 heads two tables (8 tables, 4 h2s), so the title only
        # changes on every other table.
        title = h2s[i // 2].text
    for tr in table.select("tr"):
        fields = [title]                # a player starts with the title ...
        for td in tr.select("td"):
            fields.append(td.text)      # ... followed by one entry per cell
        player = tuple(fields)
        if len(fields) > 1:
            # Rows without any cell (e.g. only "Goalkeaper") are skipped.
            players.append(player)
    first = not first
pprint(players)
# https://bytes.com/topic/python/answers/684389-removing-certain-tags-html-files
from BeautifulSoup import BeautifulSoup
def remove(soup, tagname):
    """Strip every *tagname* tag from *soup*, keeping the tag's children.

    Each matching tag is taken out of the tree and its children are put
    into the parent at the position the tag occupied, in their original
    order.
    """
    for tag in soup.findAll(tagname):
        parent = tag.parent
        # Snapshot the children: inserting a child elsewhere removes it from
        # tag.contents, so iterating that list directly skips nodes.
        children = list(tag.contents)
        # Locate the tag by identity, not equality, to get the right slot.
        position = next(i for i, node in enumerate(parent.contents)
                        if node is tag)
        tag.extract()
        for offset, child in enumerate(children):
            parent.insert(position + offset, child)

def main():
    """Demo: print a small document before and after stripping its <b> tags."""
    source = '<a><b>This is a <c>Test</c></b></a>'
    soup = BeautifulSoup(source)
    # print(...) with a single argument behaves the same as the old print
    # statement on Python 2 and also parses on Python 3.
    print(soup)
    remove(soup, 'b')
    print(soup)

css collapsible lists