From ee896cf86c2f49622fb7117f6e4b05f888fbf590 Mon Sep 17 00:00:00 2001
From: "Leonel F. de Alencar"
Date: Wed, 19 Feb 2020 16:56:44 -0300
Subject: [PATCH] issue #72 including Python module for the extraction of noun and adjective bases

---
 ExtractWordLemmaPairs.py           | 207 +++++++++++++++++++++++++++++
 tools/fst/ExtractWordLemmaPairs.py | 207 +++++++++++++++++++++++++++++
 2 files changed, 414 insertions(+)
 create mode 100755 ExtractWordLemmaPairs.py
 create mode 100755 tools/fst/ExtractWordLemmaPairs.py

diff --git a/ExtractWordLemmaPairs.py b/ExtractWordLemmaPairs.py
new file mode 100755
index 00000000..f80b6df0
--- /dev/null
+++ b/ExtractWordLemmaPairs.py
@@ -0,0 +1,207 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Leonel Figueiredo de Alencar
+# leonel.de.alencar@ufc.br
+# Date: April 20, 2018, updated February 18, 2020
+
+"""This module is the first component in the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper:
+
+ALENCAR, Leonel Figueiredo de; CUCONATO, Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.-dez. 2018.
+ISSN 1983-3652
+DOI: 10.17851/1983-3652.11.3.1-25
+http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294.
+
+
+Implausible bases are filtered out; see details below. The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline:
+
+Input in MorphoBr's format:
+
+agulhão agulha+N+AUG+M+SG
+agulhões agulha+N+AUG+M+PL
+agulhona agulha+N+AUG+F+SG
+agulhonas agulha+N+AUG+F+PL
+
+
+Output generated by this module (written to different files):
+
+a g u l h a +N +AUG
+a g u l h ã o
+
+a g u l h a +N +AUG
+a g u l h õ e s
+
+a g u l h a +N +AUG
+a g u l h o n a
+
+a g u l h a +N +AUG
+a g u l h o n a s
+
+
+"""
+import os, sys, re
+
+EXCLUDE_TAGS=["+DIM","+SUPER"]
+EXTENSION=".stxt"
+
+"""Regex pattern matching items that cannot function as bases for
+morphological derivations. This includes one or more consonants followed by a space
+at the beginning of a line, for example:
+b b+N+M+SG
+c c+N+M+SG
+d d+N+M+SG
+
+These items are in fact abbreviations. As such, they cannot feed diminutive formation,
+e.g. *bzinho 'little b' is ungrammatical (the correct form is 'bezinho', from 'bê', the name of
+letter b).
+The regex pattern also matches abbreviations such as 'ha' (for hectare) and chemical symbols ('Ba', 'Ca', etc.).
+"""
+CONS="[bcdfghjklmnpqrstvwxyz]"
+ABB=re.compile(r"(?i)(%s{1,}|%s[aeo])\s" % (CONS,CONS))
+
+aug_m_sg = open("aug_m_sg%s" % EXTENSION,"w")
+aug_m_pl = open("aug_m_pl%s" % EXTENSION,"w")
+aug_f_sg = open("aug_f_sg%s" % EXTENSION,"w")
+aug_f_pl = open("aug_f_pl%s" % EXTENSION,"w")
+
+wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION,"w")
+wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION,"w")
+wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION,"w")
+wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION,"w")
+
+masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION,"w")
+fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION,"w")
+masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION,"w")
+fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION,"w")
+
+other_m_sg = open("other_m_sg%s" % EXTENSION,"w")
+other_m_pl = open("other_m_pl%s" % EXTENSION,"w")
+other_f_sg = open("other_f_sg%s" % EXTENSION,"w")
+other_f_pl = open("other_f_pl%s" % EXTENSION,"w")
+
+def extract_entries(infile):
+    return [entry.strip().decode("utf-8") for entry in open(infile,"rU").readlines() if ignore_entry(entry.strip()) ]
+
+def split_entry(entry):
+    return re.split(r"\s+",entry)
+
+def exclude_abbr(entry):
+    if ABB.match(entry):
+        return True
+    return False
+
+def exclude_tag(entry):
+    for tag in EXCLUDE_TAGS:
+        if tag in entry:
+            return True
+    return False
+
+def ignore_entry(entry):
+    if entry == "" or exclude_tag(entry) or exclude_abbr(entry):
+        return False
+    else:
+        return True
+
+def space(word):
+    return " ".join(list(word))
+
+def convert_entry(word,lemma,tags):
+    return "%s %s\n%s" % (space(lemma),"+%s" % " +".join(tags),space(word))
+
+def parse_entry(entry):
+    word,parse=split_entry(entry)
+    lemma,tags=re.split(r"\+",parse,1)
+    return word,lemma,tags
+
+def WordLemmaInS(word,lemma):
+    if word.endswith("s") and lemma.endswith("s") and word == lemma:
+        return True
+    else:
+        return False
+
+
+def NonCanonGendMarker(word,tags):
+    if ("-" in word and "+M+PL" in tags and word.endswith("a")
+        or "-" in word and "+F+PL" in tags and word.endswith("o")
+        or "+M+SG" in tags and word.endswith("a")
+        or "+M+PL" in tags and word.endswith("as") # N-N compounds like 'aços-liga'
+        or "+F+SG" in tags and word.endswith("o")
+        or "+F+PL" in tags and word.endswith("os") # N-N compounds like 'amostras-tipo'
+        ):
+        return True
+    else:
+        return False
+
+def write_entries(entries):
+    for entry in entries:
+        word,lemma,tags=parse_entry(entry)
+        if "+AUG" in tags:
+            stxt=convert_entry(word,lemma,re.split(r"\+",tags)[:2]).encode("utf-8")
+            if "+M+SG" in tags:
+                aug_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                aug_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                aug_f_sg.write("%s\n\n" % stxt)
+            else:
+                aug_f_pl.write("%s\n\n" % stxt)
+
+        elif WordLemmaInS(word,lemma): # TODO: use re.split(r"\+",tags)[:-2], excluding gender and number tags,
+            # but including other tags besides the category tag (this may be useful in the future)
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
+            if "+M+SG" in tags:
+                wdlm_in_s_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                wdlm_in_s_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                wdlm_in_s_f_sg.write("%s\n\n" % stxt)
+            else:
+                wdlm_in_s_f_pl.write("%s\n\n" % stxt)
+
+        elif NonCanonGendMarker(word,tags): # TODO: see the above comment
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
+            if "+M+SG" in tags:
+                masc_in_a_sg.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                fem_in_o_sg.write("%s\n\n" % stxt)
+            #else: # discard plural forms
+                # this generates incorrect plurals of compounds like 'cabeça-chata' (23/01/2020)
+                #pass
+            elif "+F+PL" in tags:
+                fem_in_o_pl.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                masc_in_a_pl.write("%s\n\n" % stxt)
+        else:
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8") # TODO: tags[:-2] (see above)
+            if "+M+SG" in tags:
+                other_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                other_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                other_f_sg.write("%s\n\n" % stxt)
+            else:
+                other_f_pl.write("%s\n\n" % stxt)
+
+def main():
+    for infile in sys.argv[1:]:
+        entries=extract_entries(infile)
+        write_entries(entries)
+    aug_m_sg.close()
+    aug_m_pl.close()
+    aug_f_sg.close()
+    aug_f_pl.close()
+    wdlm_in_s_m_sg.close()
+    wdlm_in_s_m_pl.close()
+    wdlm_in_s_f_sg.close()
+    wdlm_in_s_f_pl.close()
+    masc_in_a_sg.close()
+    fem_in_o_sg.close()
+    masc_in_a_pl.close()
+    fem_in_o_pl.close()
+    other_m_sg.close()
+    other_m_pl.close()
+    other_f_sg.close()
+    other_f_pl.close()
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/fst/ExtractWordLemmaPairs.py b/tools/fst/ExtractWordLemmaPairs.py
new file mode 100755
index 00000000..f80b6df0
--- /dev/null
+++ b/tools/fst/ExtractWordLemmaPairs.py
@@ -0,0 +1,207 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Leonel Figueiredo de Alencar
+# leonel.de.alencar@ufc.br
+# Date: April 20, 2018, updated February 18, 2020
+
+"""This module is the first component in the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper:
+
+ALENCAR, Leonel Figueiredo de; CUCONATO, Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.-dez. 2018.
+ISSN 1983-3652
+DOI: 10.17851/1983-3652.11.3.1-25
+http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294.
+
+
+Implausible bases are filtered out; see details below. The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline:
+
+Input in MorphoBr's format:
+
+agulhão agulha+N+AUG+M+SG
+agulhões agulha+N+AUG+M+PL
+agulhona agulha+N+AUG+F+SG
+agulhonas agulha+N+AUG+F+PL
+
+
+Output generated by this module (written to different files):
+
+a g u l h a +N +AUG
+a g u l h ã o
+
+a g u l h a +N +AUG
+a g u l h õ e s
+
+a g u l h a +N +AUG
+a g u l h o n a
+
+a g u l h a +N +AUG
+a g u l h o n a s
+
+
+"""
+import os, sys, re
+
+EXCLUDE_TAGS=["+DIM","+SUPER"]
+EXTENSION=".stxt"
+
+"""Regex pattern matching items that cannot function as bases for
+morphological derivations. This includes one or more consonants followed by a space
+at the beginning of a line, for example:
+b b+N+M+SG
+c c+N+M+SG
+d d+N+M+SG
+
+These items are in fact abbreviations. As such, they cannot feed diminutive formation,
+e.g. *bzinho 'little b' is ungrammatical (the correct form is 'bezinho', from 'bê', the name of
+letter b).
+The regex pattern also matches abbreviations such as 'ha' (for hectare) and chemical symbols ('Ba', 'Ca', etc.).
+"""
+CONS="[bcdfghjklmnpqrstvwxyz]"
+ABB=re.compile(r"(?i)(%s{1,}|%s[aeo])\s" % (CONS,CONS))
+
+aug_m_sg = open("aug_m_sg%s" % EXTENSION,"w")
+aug_m_pl = open("aug_m_pl%s" % EXTENSION,"w")
+aug_f_sg = open("aug_f_sg%s" % EXTENSION,"w")
+aug_f_pl = open("aug_f_pl%s" % EXTENSION,"w")
+
+wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION,"w")
+wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION,"w")
+wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION,"w")
+wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION,"w")
+
+masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION,"w")
+fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION,"w")
+masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION,"w")
+fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION,"w")
+
+other_m_sg = open("other_m_sg%s" % EXTENSION,"w")
+other_m_pl = open("other_m_pl%s" % EXTENSION,"w")
+other_f_sg = open("other_f_sg%s" % EXTENSION,"w")
+other_f_pl = open("other_f_pl%s" % EXTENSION,"w")
+
+def extract_entries(infile):
+    return [entry.strip().decode("utf-8") for entry in open(infile,"rU").readlines() if ignore_entry(entry.strip()) ]
+
+def split_entry(entry):
+    return re.split(r"\s+",entry)
+
+def exclude_abbr(entry):
+    if ABB.match(entry):
+        return True
+    return False
+
+def exclude_tag(entry):
+    for tag in EXCLUDE_TAGS:
+        if tag in entry:
+            return True
+    return False
+
+def ignore_entry(entry):
+    if entry == "" or exclude_tag(entry) or exclude_abbr(entry):
+        return False
+    else:
+        return True
+
+def space(word):
+    return " ".join(list(word))
+
+def convert_entry(word,lemma,tags):
+    return "%s %s\n%s" % (space(lemma),"+%s" % " +".join(tags),space(word))
+
+def parse_entry(entry):
+    word,parse=split_entry(entry)
+    lemma,tags=re.split(r"\+",parse,1)
+    return word,lemma,tags
+
+def WordLemmaInS(word,lemma):
+    if word.endswith("s") and lemma.endswith("s") and word == lemma:
+        return True
+    else:
+        return False
+
+
+def NonCanonGendMarker(word,tags):
+    if ("-" in word and "+M+PL" in tags and word.endswith("a")
+        or "-" in word and "+F+PL" in tags and word.endswith("o")
+        or "+M+SG" in tags and word.endswith("a")
+        or "+M+PL" in tags and word.endswith("as") # N-N compounds like 'aços-liga'
+        or "+F+SG" in tags and word.endswith("o")
+        or "+F+PL" in tags and word.endswith("os") # N-N compounds like 'amostras-tipo'
+        ):
+        return True
+    else:
+        return False
+
+def write_entries(entries):
+    for entry in entries:
+        word,lemma,tags=parse_entry(entry)
+        if "+AUG" in tags:
+            stxt=convert_entry(word,lemma,re.split(r"\+",tags)[:2]).encode("utf-8")
+            if "+M+SG" in tags:
+                aug_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                aug_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                aug_f_sg.write("%s\n\n" % stxt)
+            else:
+                aug_f_pl.write("%s\n\n" % stxt)
+
+        elif WordLemmaInS(word,lemma): # TODO: use re.split(r"\+",tags)[:-2], excluding gender and number tags,
+            # but including other tags besides the category tag (this may be useful in the future)
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
+            if "+M+SG" in tags:
+                wdlm_in_s_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                wdlm_in_s_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                wdlm_in_s_f_sg.write("%s\n\n" % stxt)
+            else:
+                wdlm_in_s_f_pl.write("%s\n\n" % stxt)
+
+        elif NonCanonGendMarker(word,tags): # TODO: see the above comment
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
+            if "+M+SG" in tags:
+                masc_in_a_sg.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                fem_in_o_sg.write("%s\n\n" % stxt)
+            #else: # discard plural forms
+                # this generates incorrect plurals of compounds like 'cabeça-chata' (23/01/2020)
+                #pass
+            elif "+F+PL" in tags:
+                fem_in_o_pl.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                masc_in_a_pl.write("%s\n\n" % stxt)
+        else:
+            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8") # TODO: tags[:-2] (see above)
+            if "+M+SG" in tags:
+                other_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                other_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                other_f_sg.write("%s\n\n" % stxt)
+            else:
+                other_f_pl.write("%s\n\n" % stxt)
+
+def main():
+    for infile in sys.argv[1:]:
+        entries=extract_entries(infile)
+        write_entries(entries)
+    aug_m_sg.close()
+    aug_m_pl.close()
+    aug_f_sg.close()
+    aug_f_pl.close()
+    wdlm_in_s_m_sg.close()
+    wdlm_in_s_m_pl.close()
+    wdlm_in_s_f_sg.close()
+    wdlm_in_s_f_pl.close()
+    masc_in_a_sg.close()
+    fem_in_o_sg.close()
+    masc_in_a_pl.close()
+    fem_in_o_pl.close()
+    other_m_sg.close()
+    other_m_pl.close()
+    other_f_sg.close()
+    other_f_pl.close()
+
+if __name__ == '__main__':
+    main()
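
A quick way to sanity-check the spaced-text conversion described in the module docstring is the minimal sketch below. It is not part of the patch: the helper name to_spaced_text is assumed for illustration only, and it simply mirrors parse_entry, space and the tag pruning done in the +AUG branch of write_entries, applied to the docstring's 'agulhão' example.

# -*- coding: utf-8 -*-
# Illustrative sketch only; to_spaced_text is an assumed name, not a function
# of the patched module. It mirrors parse_entry/space/convert_entry and the
# +AUG tag pruning of write_entries.
import re

def space(word):
    # "agulha" -> "a g u l h a"
    return " ".join(list(word))

def to_spaced_text(entry):
    # Split a MorphoBr entry such as "agulhão agulha+N+AUG+M+SG".
    word, parse = re.split(r"\s+", entry.strip())
    lemma, tags = re.split(r"\+", parse, 1)
    # Keep only the category tag and +AUG, as the +AUG branch of
    # write_entries does via re.split(r"\+", tags)[:2].
    kept = re.split(r"\+", tags)[:2]
    return "%s +%s\n%s" % (space(lemma), " +".join(kept), space(word))

if __name__ == "__main__":
    print(to_spaced_text(u"agulhão agulha+N+AUG+M+SG"))
    # expected output:
    # a g u l h a +N +AUG
    # a g u l h ã o

Running the sketch prints the same two spaced-text lines the docstring shows for 'agulhão', i.e. the format of the .stxt files consumed by the finite-state grammar in morphotactic-grammar.lexc.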