From ee896cf86c2f49622fb7117f6e4b05f888fbf590 Mon Sep 17 00:00:00 2001
From: "Leonel F. de Alencar"
Date: Wed, 19 Feb 2020 16:56:44 -0300
Subject: [PATCH] issue #72 including Python module for the extraction of noun and adjective bases

---
 ExtractWordLemmaPairs.py           | 207 +++++++++++++++++++++++++++++
 tools/fst/ExtractWordLemmaPairs.py | 207 +++++++++++++++++++++++++++++
 2 files changed, 414 insertions(+)
 create mode 100755 ExtractWordLemmaPairs.py
 create mode 100755 tools/fst/ExtractWordLemmaPairs.py

diff --git a/ExtractWordLemmaPairs.py b/ExtractWordLemmaPairs.py
new file mode 100755
index 00000000..f80b6df0
--- /dev/null
+++ b/ExtractWordLemmaPairs.py
@@ -0,0 +1,207 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Leonel Figueiredo de Alencar
+# leonel.de.alencar@ufc.br
+# Date: April 20, 2018, updated February 18, 2020
+
+"""This module is the first component in the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper:
+
+ALENCAR, Leonel Figueiredo de; CUCONATO, Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.-dez. 2018.
+ISSN 1983-3652
+DOI: 10.17851/1983-3652.11.3.1-25
+http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294.
+
+
+Implausible bases are filtered out; see details below. The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline:
+
+Input in MorphoBr's format:
+
+agulhão agulha+N+AUG+M+SG
+agulhões agulha+N+AUG+M+PL
+agulhona agulha+N+AUG+F+SG
+agulhonas agulha+N+AUG+F+PL
+
+
+Output generated by this module (written to different files):
+
+a g u l h a +N +AUG
+a g u l h ã o
+
+a g u l h a +N +AUG
+a g u l h õ e s
+
+a g u l h a +N +AUG
+a g u l h o n a
+
+a g u l h a +N +AUG
+a g u l h o n a s
+
+
+"""
+import os, sys, re
+
+EXCLUDE_TAGS=["+DIM","+SUPER"]
+EXTENSION=".stxt"
+
+"""Regex pattern matching items that cannot function as bases for
+morphological derivations. This includes one or more consonants followed by a space
+at the beginning of a line, for example:
+b b+N+M+SG
+c c+N+M+SG
+d d+N+M+SG
+
+These items are in fact abbreviations. As such, they cannot feed diminutive formation,
+e.g. *bzinho 'little b' is ungrammatical (the correct form is 'bezinho', from 'bê', the name of
+letter b).
+The regex pattern also matches abbreviations such as 'ha' (for hectare) and chemical symbols ('Ba', 'Ca', etc.).
+"""
+CONS="[bcdfghjklmnpqrstvwxyz]"
+ABB=re.compile(r"(?i)(%s{1,}|%s[aeo])\s" % (CONS,CONS))
+
+aug_m_sg = open("aug_m_sg%s" % EXTENSION,"w")
+aug_m_pl = open("aug_m_pl%s" % EXTENSION,"w")
+aug_f_sg = open("aug_f_sg%s" % EXTENSION,"w")
+aug_f_pl = open("aug_f_pl%s" % EXTENSION,"w")
+
+wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION,"w")
+wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION,"w")
+wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION,"w")
+wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION,"w")
+
+masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION,"w")
+fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION,"w")
+masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION,"w")
+fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION,"w")
+
+other_m_sg = open("other_m_sg%s" % EXTENSION,"w")
+other_m_pl = open("other_m_pl%s" % EXTENSION,"w")
+other_f_sg = open("other_f_sg%s" % EXTENSION,"w")
+other_f_pl = open("other_f_pl%s" % EXTENSION,"w")
+
+def extract_entries(infile):
+    return [entry.strip().decode("utf-8") for entry in open(infile,"rU").readlines() if ignore_entry(entry.strip()) ]
+
+def split_entry(entry):
+    return re.split(r"\s+",entry)
+
+def exclude_abbr(entry):
+    if ABB.match(entry):
+        return True
+    return False
+
+def exclude_tag(entry):
+    for tag in EXCLUDE_TAGS:
+        if tag in entry:
+            return True
+    return False
+
+def ignore_entry(entry):
+    if entry == "" or exclude_tag(entry) or exclude_abbr(entry):
+        return False
+    else:
+        return True
+
+def space(word):
+    return " ".join(list(word))
+
+def convert_entry(word,lemma,tags):
+    return "%s %s\n%s" % (space(lemma),"+%s" % " +".join(tags),space(word))
+
+def parse_entry(entry):
+    word,parse=split_entry(entry)
+    lemma,tags=re.split(r"\+",parse,1)
+    return word,lemma,tags
+
+def WordLemmaInS(word,lemma):
+    if word.endswith("s") and lemma.endswith("s") and word == lemma:
+        return True
+    else:
+        return False
+
+
+def NonCanonGendMarker(word,tags):
+    if ("-" in word and "+M+PL" in tags and word.endswith("a")
+        or "-" in word and "+F+PL" in tags and word.endswith("o")
+        or "+M+SG" in tags and word.endswith("a")
+        or "+M+PL" in tags and word.endswith("as") # N-N compounds like 'aços-liga'
+        or "+F+SG" in tags and word.endswith("o")
+        or "+F+PL" in tags and word.endswith("os") # N-N compounds like 'amostras-tipo'
+        ):
+        return True
+    else:
+        return False
+
+def write_entries(entries):
+    for entry in entries:
+        word,lemma,tags=parse_entry(entry)
+        if "+AUG" in tags:
+            stxt=convert_entry(word,lemma,re.split(r"\+",tags)[:2]).encode("utf-8")
+            if "+M+SG" in tags:
+                aug_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                aug_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                aug_f_sg.write("%s\n\n" % stxt)
+            else:
+                aug_f_pl.write("%s\n\n" % stxt)
+
+        elif WordLemmaInS(word,lemma): # TODO: use re.split(r"\+",tags)[:-2], excluding gender and number tags,
+            # but including other tags besides the category tag (this may be useful in the future)
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
+            if "+M+SG" in tags:
+                wdlm_in_s_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                wdlm_in_s_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                wdlm_in_s_f_sg.write("%s\n\n" % stxt)
+            else:
+                wdlm_in_s_f_pl.write("%s\n\n" % stxt)
+
+        elif NonCanonGendMarker(word,tags): # TODO: see the above comment
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
+            if "+M+SG" in tags:
+                masc_in_a_sg.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                fem_in_o_sg.write("%s\n\n" % stxt)
+            #else: # discard plural forms
+                # this generates incorrect plurals of compounds like 'cabeça-chata' (23/01/2020)
+                #pass
+            elif "+F+PL" in tags:
+                fem_in_o_pl.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                masc_in_a_pl.write("%s\n\n" % stxt)
+        else:
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8") # TODO: tags[:-2] (see above)
+            if "+M+SG" in tags:
+                other_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                other_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                other_f_sg.write("%s\n\n" % stxt)
+            else:
+                other_f_pl.write("%s\n\n" % stxt)
+
+def main():
+    for infile in sys.argv[1:]:
+        entries=extract_entries(infile)
+        write_entries(entries)
+    aug_m_sg.close()
+    aug_m_pl.close()
+    aug_f_sg.close()
+    aug_f_pl.close()
+    wdlm_in_s_m_sg.close()
+    wdlm_in_s_m_pl.close()
+    wdlm_in_s_f_sg.close()
+    wdlm_in_s_f_pl.close()
+    masc_in_a_sg.close()
+    fem_in_o_sg.close()
+    masc_in_a_pl.close()
+    fem_in_o_pl.close()
+    other_m_sg.close()
+    other_m_pl.close()
+    other_f_sg.close()
+    other_f_pl.close()
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/fst/ExtractWordLemmaPairs.py b/tools/fst/ExtractWordLemmaPairs.py
new file mode 100755
index 00000000..f80b6df0
--- /dev/null
+++ b/tools/fst/ExtractWordLemmaPairs.py
@@ -0,0 +1,207 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Leonel Figueiredo de Alencar
+# leonel.de.alencar@ufc.br
+# Date: April 20, 2018, updated February 18, 2020
+
+"""This module is the first component in the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper:
+
+ALENCAR, Leonel Figueiredo de; CUCONATO, Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.-dez. 2018.
+ISSN 1983-3652
+DOI: 10.17851/1983-3652.11.3.1-25
+http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294.
+
+
+Implausible bases are filtered out; see details below. The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline:
+
+Input in MorphoBr's format:
+
+agulhão agulha+N+AUG+M+SG
+agulhões agulha+N+AUG+M+PL
+agulhona agulha+N+AUG+F+SG
+agulhonas agulha+N+AUG+F+PL
+
+
+Output generated by this module (written to different files):
+
+a g u l h a +N +AUG
+a g u l h ã o
+
+a g u l h a +N +AUG
+a g u l h õ e s
+
+a g u l h a +N +AUG
+a g u l h o n a
+
+a g u l h a +N +AUG
+a g u l h o n a s
+
+
+"""
+import os, sys, re
+
+EXCLUDE_TAGS=["+DIM","+SUPER"]
+EXTENSION=".stxt"
+
+"""Regex pattern matching items that cannot function as bases for
+morphological derivations. This includes one or more consonants followed by a space
+at the beginning of a line, for example:
+b b+N+M+SG
+c c+N+M+SG
+d d+N+M+SG
+
+These items are in fact abbreviations. As such, they cannot feed diminutive formation,
+e.g. *bzinho 'little b' is ungrammatical (the correct form is 'bezinho', from 'bê', the name of
+letter b).
+The regex pattern also matches abbreviations such as 'ha' (for hectare) and chemical symbols ('Ba', 'Ca', etc.).
+"""
+CONS="[bcdfghjklmnpqrstvwxyz]"
+ABB=re.compile(r"(?i)(%s{1,}|%s[aeo])\s" % (CONS,CONS))
+
+aug_m_sg = open("aug_m_sg%s" % EXTENSION,"w")
+aug_m_pl = open("aug_m_pl%s" % EXTENSION,"w")
+aug_f_sg = open("aug_f_sg%s" % EXTENSION,"w")
+aug_f_pl = open("aug_f_pl%s" % EXTENSION,"w")
+
+wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION,"w")
+wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION,"w")
+wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION,"w")
+wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION,"w")
+
+masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION,"w")
+fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION,"w")
+masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION,"w")
+fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION,"w")
+
+other_m_sg = open("other_m_sg%s" % EXTENSION,"w")
+other_m_pl = open("other_m_pl%s" % EXTENSION,"w")
+other_f_sg = open("other_f_sg%s" % EXTENSION,"w")
+other_f_pl = open("other_f_pl%s" % EXTENSION,"w")
+
+def extract_entries(infile):
+    return [entry.strip().decode("utf-8") for entry in open(infile,"rU").readlines() if ignore_entry(entry.strip()) ]
+
+def split_entry(entry):
+    return re.split(r"\s+",entry)
+
+def exclude_abbr(entry):
+    if ABB.match(entry):
+        return True
+    return False
+
+def exclude_tag(entry):
+    for tag in EXCLUDE_TAGS:
+        if tag in entry:
+            return True
+    return False
+
+def ignore_entry(entry):
+    if entry == "" or exclude_tag(entry) or exclude_abbr(entry):
+        return False
+    else:
+        return True
+
+def space(word):
+    return " ".join(list(word))
+
+def convert_entry(word,lemma,tags):
+    return "%s %s\n%s" % (space(lemma),"+%s" % " +".join(tags),space(word))
+
+def parse_entry(entry):
+    word,parse=split_entry(entry)
+    lemma,tags=re.split(r"\+",parse,1)
+    return word,lemma,tags
+
+def WordLemmaInS(word,lemma):
+    if word.endswith("s") and lemma.endswith("s") and word == lemma:
+        return True
+    else:
+        return False
+
+
+def NonCanonGendMarker(word,tags):
+    if ("-" in word and "+M+PL" in tags and word.endswith("a")
+        or "-" in word and "+F+PL" in tags and word.endswith("o")
+        or "+M+SG" in tags and word.endswith("a")
+        or "+M+PL" in tags and word.endswith("as") # N-N compounds like 'aços-liga'
+        or "+F+SG" in tags and word.endswith("o")
+        or "+F+PL" in tags and word.endswith("os") # N-N compounds like 'amostras-tipo'
+        ):
+        return True
+    else:
+        return False
+
+def write_entries(entries):
+    for entry in entries:
+        word,lemma,tags=parse_entry(entry)
+        if "+AUG" in tags:
+            stxt=convert_entry(word,lemma,re.split(r"\+",tags)[:2]).encode("utf-8")
+            if "+M+SG" in tags:
+                aug_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                aug_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                aug_f_sg.write("%s\n\n" % stxt)
+            else:
+                aug_f_pl.write("%s\n\n" % stxt)
+
+        elif WordLemmaInS(word,lemma): # TODO: use re.split(r"\+",tags)[:-2], excluding gender and number tags,
+            # but including other tags besides the category tag (this may be useful in the future)
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
+            if "+M+SG" in tags:
+                wdlm_in_s_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                wdlm_in_s_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                wdlm_in_s_f_sg.write("%s\n\n" % stxt)
+            else:
+                wdlm_in_s_f_pl.write("%s\n\n" % stxt)
+
+        elif NonCanonGendMarker(word,tags): # TODO: see the above comment
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
+            if "+M+SG" in tags:
+                masc_in_a_sg.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                fem_in_o_sg.write("%s\n\n" % stxt)
+            #else: # discard plural forms
+                # this generates incorrect plurals of compounds like 'cabeça-chata' (23/01/2020)
+                #pass
+            elif "+F+PL" in tags:
+                fem_in_o_pl.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                masc_in_a_pl.write("%s\n\n" % stxt)
+        else:
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8") # TODO: tags[:-2] (see above)
+            if "+M+SG" in tags:
+                other_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                other_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                other_f_sg.write("%s\n\n" % stxt)
+            else:
+                other_f_pl.write("%s\n\n" % stxt)
+
+def main():
+    for infile in sys.argv[1:]:
+        entries=extract_entries(infile)
+        write_entries(entries)
+    aug_m_sg.close()
+    aug_m_pl.close()
+    aug_f_sg.close()
+    aug_f_pl.close()
+    wdlm_in_s_m_sg.close()
+    wdlm_in_s_m_pl.close()
+    wdlm_in_s_f_sg.close()
+    wdlm_in_s_f_pl.close()
+    masc_in_a_sg.close()
+    fem_in_o_sg.close()
+    masc_in_a_pl.close()
+    fem_in_o_pl.close()
+    other_m_sg.close()
+    other_m_pl.close()
+    other_f_sg.close()
+    other_f_pl.close()
+
+if __name__ == '__main__':
+    main()
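
A quick way to sanity-check the spaced-text conversion described in the module docstring is the minimal sketch below. It is not part of the patch: the helper name to_spaced_text is assumed for illustration only, and it simply mirrors parse_entry, space and the tag pruning done in the +AUG branch of write_entries, applied to the docstring's 'agulhão' example.

# -*- coding: utf-8 -*-
# Illustrative sketch only; to_spaced_text is an assumed name, not a function
# of the patched module. It mirrors parse_entry/space/convert_entry and the
# +AUG tag pruning of write_entries.
import re

def space(word):
    # "agulha" -> "a g u l h a"
    return " ".join(list(word))

def to_spaced_text(entry):
    # Split a MorphoBr entry such as "agulhão agulha+N+AUG+M+SG".
    word, parse = re.split(r"\s+", entry.strip())
    lemma, tags = re.split(r"\+", parse, 1)
    # Keep only the category tag and +AUG, as the +AUG branch of
    # write_entries does via re.split(r"\+", tags)[:2].
    kept = re.split(r"\+", tags)[:2]
    return "%s +%s\n%s" % (space(lemma), " +".join(kept), space(word))

if __name__ == "__main__":
    print(to_spaced_text(u"agulhão agulha+N+AUG+M+SG"))
    # expected output:
    # a g u l h a +N +AUG
    # a g u l h ã o

Running the sketch prints the same two spaced-text lines the docstring shows for 'agulhão', i.e. the format of the .stxt files consumed by the finite-state grammar in morphotactic-grammar.lexc.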