# word_pic.py
# -*- coding: utf-8 -*-
import codecs
from kelly import process_csv, get_svalex_info
from call_korp import call_korp
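
# This script builds a combined Kelly/SVALex list of Swedish nouns and
# verbs, queries Korp's "relations" (word picture) API for each item,
# saves relations with a high mutual information (MI) score to a
# tab-separated file, and scores tokens in parsed sentences against the
# saved word pictures (get_mutual_info).
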
def collect_lex_items(lexical_resources, target_tags):
    """
    Collects items from the provided lexical resources with the
    specified target POS tags. Duplicates are removed and POS tags
    are normalized to the common (Korp pipeline) tag set.
    """
    tag_mapping = {"noun": "NN", "verb": "VB"}
    items = []
    for lex_res, rows in lexical_resources.items():
        for row in rows:
            for tag in target_tags[lex_res]:
                if lex_res == "kelly":
                    if row["Word classes"] == tag:
                        new_line = [row["Swedish items for translation"].split("(")[0].strip(" "),
                                    tag_mapping[tag], lex_res]
                        if new_line not in items:
                            items.append(new_line)
                elif lex_res == "svalex":
                    if row["tag"][:2] == tag:
                        new_line = [row["word"], tag, lex_res]
                        lemma_pos = [itm[:2] for itm in items]
                        if new_line[:2] not in lemma_pos:
                            items.append(new_line)
                        else:
                            # already collected from the other resource: record both sources
                            dupl_ind = lemma_pos.index(new_line[:2])
                            if items[dupl_ind][2] != lex_res:
                                items[dupl_ind][2] += "," + lex_res
    result = ["\t".join(el) for el in items]
    return result
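
# Each collected item is a tab-separated "lemma<TAB>POS<TAB>source" string,
# e.g. (illustrative, not from a real run): u"bok\tNN\tkelly,svalex".
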
def save_lex_items(filename, location, content_to_save):
    with codecs.open(location + filename, "w", "utf-8") as f:
        f.write("\n".join(content_to_save))


def load_lex_items(filename, location):
    with codecs.open(location + filename, "r", "utf-8") as f:
        return f.readlines()


def add_f_content(filename, location, line):
    with codecs.open(location + filename, "a", "utf-8") as f:
        f.write(line)
def get_word_pic(query_word, corpora):
    clist = ",".join(corpora)
    return call_korp({"command": "relations",
                      "word": query_word,  # e.g. ge..vb.1, ta_upp..vbm.1
                      "type": "lemgram",
                      "corpus": clist})
def save_word_pics(kelly_svalex, corpora, filename, location):
    """
    kelly_svalex: the list of items loaded from a file.
    Queries Korp for each item and appends relations with a high MI score
    to a tab-separated file.
    """
    result = {}
    add_f_content(filename, location, "lemma\tPOS\tsource\tdep_rel\trel_type\trel_lemma\trel_POS\tMI\n")
    for ll in kelly_svalex[8000:]:  # process the list one slice at a time (adjust per run)
        lemma, pos, source = ll.split("\t")
        if "_" in lemma:  # multiword expressions use the "m" lemgram marker
            query_word = lemma + ".." + pos.lower() + "m.1"
        else:
            query_word = lemma + ".." + pos.lower() + ".1"
        print "QUERY: ", query_word.encode("utf-8")
        try:
            word_pic_info = get_word_pic(query_word.encode("utf-8"), corpora)["relations"]
            for rel in word_pic_info:
                relation = rel["rel"]
                mi = rel["mi"]
                if mi > 50 and relation in ["SS", "OBJ", "AT"]:
                    if rel["dep"] == query_word:
                        rel_lemma = rel["head"]
                        rel_type = "has head"  # the associated lemma is the head of the query word
                        rel_POS = rel["headpos"]
                    else:
                        rel_lemma = rel["dep"]
                        rel_type = "has dep"
                        rel_POS = rel["deppos"]
                    if relation == "OBJ":
                        relation = "OO"  # also other obj relations?
                    rel_info = [lemma, pos, source.strip("\n"), relation, rel_type, rel_lemma, rel_POS, str(mi)]
                    # filter duplicate information
                    if (lemma, rel_lemma, rel_type) not in result:
                        result[(lemma, rel_lemma, rel_type)] = rel_info
                        rel_info_line = "\t".join(rel_info) + "\n"
                        add_f_content(filename, location, rel_info_line)
        except KeyError:  # no "relations" in the Korp response for this item
            pass
    return result
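
# An illustrative line of the resulting file (values are made up), matching
# the header written above:
#   ge<TAB>VB<TAB>kelly,svalex<TAB>OO<TAB>has dep<TAB>bok..nn.1<TAB>NN<TAB>87.3
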
def load_word_pics(word_pics_file, location):
    """
    Output: {(lemma, POS): [{"lemma": "xxx", ..., "rel_lemma": "yyy"}, {...}]}
    """
    with codecs.open(location + word_pics_file, "r", "utf-8") as f:
        lines = f.readlines()
    word_pics = {}
    keys = lines[0].strip("\n").split("\t")
    for line in lines[1:]:
        line_els = line.split("\t")
        line_obj = {}
        for i, el in enumerate(line_els):
            # e.g. {"lemma": "vara", "POS": "VB", ..., "MI": "123.45"}
            line_obj[keys[i]] = el.strip("\n")
        lemma, pos = line_els[0], line_els[1]
        if (lemma, pos) in word_pics:
            word_pics[(lemma, pos)].append(line_obj)
        else:
            word_pics[(lemma, pos)] = [line_obj]
    return word_pics
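
# Illustrative shape of the mapping returned by load_word_pics (values are
# made up; all fields are kept as strings):
#
#   {(u"ge", u"VB"): [{"lemma": "ge", "POS": "VB", "source": "kelly,svalex",
#                      "dep_rel": "OO", "rel_type": "has dep",
#                      "rel_lemma": "bok..nn.1", "rel_POS": "NN",
#                      "MI": "87.3"},
#                     ...]}
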
def get_mutual_info(token, all_tokens, stats, word_pictures):
    """
    Returns (mi_score, used_rel_lemma) for a token: the MI score of the
    word-picture relation (if any) that connects the token to its head
    or to one of its dependents.
    """
    mi_score = 0.0
    used_rel_lemma = ""
    if token.pos in ["NN", "VB"] and token.lemma:
        for lemma_pos, wps in word_pictures.items():
            if lemma_pos == (token.lemma[0], token.pos):  # TODO: check all lemmas, not just [0]?
                for wp in wps:
                    is_lemgram = True
                    rel_item = wp["rel_lemma"].split(".")
                    if len(rel_item) < 2:  # wordforms (non-lemmatized tokens), e.g. 'pågatåg'
                        is_lemgram = False
                    rel_lemma = wp["rel_lemma"]  # lemgram ('lex'), e.g. roll..n.1
                    # token as dependent
                    if wp["rel_type"] == "has head" and wp["dep_rel"] == token.deprel:
                        head = all_tokens[int(token.depheadid) - 1]
                        if is_lemgram and head.lex:
                            if rel_lemma == head.lex[0] and (rel_item[0], wp["rel_POS"]) not in stats["used_rel_lemmas"]:  # and wp["rel_POS"] == head.pos
                                mi_score = float(wp["MI"])
                                used_rel_lemma = (rel_item[0], wp["rel_POS"])
                        elif head.suffix:
                            if rel_lemma == head.suffix[0] and (rel_item[0], wp["rel_POS"]) not in stats["used_rel_lemmas"]:  # and wp["rel_POS"] == head.pos
                                mi_score = float(wp["MI"])
                                used_rel_lemma = (rel_item[0], wp["rel_POS"])
                        elif rel_lemma == head.word and wp["rel_POS"] == head.pos:
                            mi_score = float(wp["MI"])
                    # token as head
                    if wp["rel_type"] == "has dep" and "heads" in stats:
                        for h, deps in stats["heads"].items():  # dict, not list: {"head_ref": [list of child nodes]}
                            if h == token.ref:
                                for d in deps:
                                    if d.deprel == wp["dep_rel"]:
                                        if is_lemgram and d.lemma and d.lex:
                                            if d.lex[0] == rel_lemma and (rel_item[0], wp["rel_POS"]) not in stats["used_rel_lemmas"]:
                                                mi_score = float(wp["MI"])
                                                used_rel_lemma = (rel_item[0], wp["rel_POS"])
                                        elif d.suffix:
                                            if d.suffix[0] == rel_lemma and (rel_item[0], wp["rel_POS"]) not in stats["used_rel_lemmas"]:
                                                mi_score = float(wp["MI"])
                                                used_rel_lemma = (rel_item[0], wp["rel_POS"])
                                        elif d.word == rel_lemma and d.pos == wp["rel_POS"]:
                                            mi_score = float(wp["MI"])
                                            used_rel_lemma = (rel_lemma, wp["rel_POS"])
    return (mi_score, used_rel_lemma)  # TODO: check why MI for different senses is still repeated
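
# get_mutual_info is duck-typed over parse objects produced elsewhere in the
# pipeline. A sketch of the attributes it assumes, taken from the accesses
# above rather than from a documented interface:
#
#   token.word       surface form
#   token.pos        part-of-speech tag, e.g. "NN", "VB"
#   token.lemma      list of lemma candidates
#   token.lex        list of lemgram candidates
#   token.suffix     list of compound-suffix lemgrams
#   token.deprel     dependency relation to the token's head
#   token.depheadid  1-based index of the head token in all_tokens
#   token.ref        token id used as key in stats["heads"]
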
# ------------ function calls --------------------
# 1. Create the Kelly-SVALex list of nouns and verbs
# kelly = process_csv("/media/phd/DEVELOPMENT/rdby_exp/scripts/kelly_sv.csv")
# svalex = process_csv("/media/phd/DEVELOPMENT/rdby_exp/scripts/SVALex_final.csv")
# lexical_resources = {"kelly": kelly, "svalex": svalex}
# target_tags = {"kelly": ["noun", "verb"], "svalex": ["NN", "VB"]}
# l = collect_lex_items(lexical_resources, target_tags)
filename = "kelly_svalex_NN_VB.txt"
location = "/media/phd/DEVELOPMENT/rdby_exp/scripts/auxiliaries/"
# save_lex_items(filename, location, l)

# 2. Query Korp for word pictures and save them to wp_file
wp_file = "word_pics.csv"
# query_word = "ge..vb.1"
# Corpus selection aspects: a variety of genres, manually annotated corpora,
# recent material for up-to-date language, and easy-to-read texts for
# finding more common patterns.
wp_corpora = ["rom99", "bloggmix2014", "gp2013", "attasidor", "lasbart",
              "suc3", "wikipedia-sv", "talbanken"]
# wp = get_word_pic(query_word, wp_corpora)
# print wp["relations"][0]["head"]
# kelly_svalex = load_lex_items(filename, location)
# for l in kelly_svalex[:10]:
#     print l.encode("utf-8")
# r = save_word_pics(kelly_svalex, wp_corpora, wp_file, location)

# 3. Load the saved word pictures
# wps = load_word_pics(wp_file, location)
# for k, v in wps.items():
#     if k[0] == "vara":
#         print k[0].encode("utf-8"), v