From 6c109a138117a3d4cb18f39cb5aa7505c4c71c9b Mon Sep 17 00:00:00 2001
From: "Leonel F. de Alencar" <leonel.de.alencar@ufc.br>
Date: Thu, 28 Jun 2018 11:01:30 -0300
Subject: [PATCH 1/3] #53 adding separate delaf=>mbr converter and clitic
 annotator

---
 tools/python-converter/AnnotateClitics.py     |  41 ++++
 tools/python-converter/BuildPairsFromDELAF.py | 167 --------------
 tools/python-converter/ConvertDELAF.py        | 212 ++++++++++++++++++
 3 files changed, 253 insertions(+), 167 deletions(-)
 create mode 100755 tools/python-converter/AnnotateClitics.py
 delete mode 100644 tools/python-converter/BuildPairsFromDELAF.py
 create mode 100755 tools/python-converter/ConvertDELAF.py

diff --git a/tools/python-converter/AnnotateClitics.py b/tools/python-converter/AnnotateClitics.py
new file mode 100755
index 00000000..7520eb31
--- /dev/null
+++ b/tools/python-converter/AnnotateClitics.py
@@ -0,0 +1,41 @@
+#! /usr/bin/env python2.7
+# -*- coding: utf-8 -*-
+
+# Author: Leonel Figueiredo de Alencar - Federal University of Ceará
+# leonel.de.alencar@ufc.br
+# Date: June 27, 2018
+"""
+This module annotates enclitic or mesoclitic pronouns in entries in the MBR format
+
+Usage: cat infile.mbr | AnnotateClitics.py > outfile.mbr
+
+It reads entries in the MBR format from standard input and replaces
+the +PRO tag with tags representing the lemma and morpho-syntactic features
+of the pronouns. The result is written to standard output.
+For example, an entry like
+
+degustares-lhe	degustar+V+PRO+SBJF+2+SG
+
+is converted to
+
+degustares-lhe	degustar+V.ele.DAT.3.SG+SBJF+2+SG
+
+Tag conversion is performed by the AnnotateClitic function from
+the module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled. 
+For more details, see the respective module documentation.
+"""
+import sys
+from ConvertDELAF import *
+
+
+def main():
+    entries=ExtractEntries(sys.stdin)
+    for entry in entries:
+        if HasClitic(entry):
+            parts=ParseEntry(entry,r"\t|\+")
+            word,lemma,cat,feats=parts[0],parts[1],parts[2],parts[4:]
+            print AnnotateClitic(word,lemma,cat,feats).encode("utf-8")
+        else:
+            print entry.encode("utf-8")
+if __name__ == '__main__':
+	main()
diff --git a/tools/python-converter/BuildPairsFromDELAF.py b/tools/python-converter/BuildPairsFromDELAF.py
deleted file mode 100644
index b80dd04a..00000000
--- a/tools/python-converter/BuildPairsFromDELAF.py
+++ /dev/null
@@ -1,167 +0,0 @@
-#! /usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# Author: Leonel Figueiredo de Alencar
-# leonel.de.alencar@ufc.br
-# Date: Mai 8, 2018
-
-# Usage: python BuildPairsFromDELAF.py infile1 [infile2 ... infilen]
-
-""""Module for building word-parse pairs from DELAF-PB, converting entries in the format
-
-agendas,agenda.N:fp
-
-to an intermediate format used to compile lexical transducers with XFST or FOMA, using the respective
-spaced-text compilers:
-
-agendas	agenda+N+F+PL
-
-The module also substitutes the PRO tag in verb forms with an enclitic or mesoclitic pronoun for tags 
-representing the lemma and morpho-syntactic features of these pronouns. Thus, convertions like the following
-are performed:
-
-degustares-lhe,degustar.V+PRO:W2s
-degustares-lhe	degustar+V.ele.DAT.3.SG+SBJF+2+SG
-
-Entries in this intermediate format, in turn, can be converted to spaced-text using the BuildSpacedText.py module.
-The tag convertions are based on the mappings defined in the dictionaries stored in the global variables TAG_MAPPING and CLITIC_MAPPING.
-The regular expression pattern in the global variable SEPARATOR is used for splitting 
-entries in its component parts, i.e. word form, lemma, lexical category and morpho-syntactic features:
-
-mantinham manter V I3p
-
-This separator also handles incorrectly encoded entries
-from DELAF-PB like the following, where the first colon is spurious:
-
-mantinhas:,manter.V:I2s
-
-A spurious hyphen in cases like abstinhas:-lhe,abster.V+PRO:I2s is eliminated by the function ConvertEntry()
-"""
-
-import re,sys,os
-from cPickle import load,dump
-from BuildSpacedText import extract_entries
-
-SEPARATOR=r"[,.:]+"
-# edit paths to point to your local files
-PATH_TO_MAPPING_FILE=os.path.expanduser("~/morphtools/tag_mapping.txt")
-PATH_TO_MAPPING_FILE2=os.path.expanduser("~/morphtools/clitics.txt")
-
-PRO="+PRO"
-
-def UnpickleMapping(infile):
-    f=open(infile,"rb")
-    dic=load(f)
-    f.close()
-    return dic
-
-# edit paths to point to your local files
-TAG_MAPPING=UnpickleMapping(os.path.expanduser("~/morphtools_data/tag_mapping.pkl"))
-CLITIC_MAPPING=UnpickleMapping(os.path.expanduser("~/morphtools_data/clitics.pkl"))
-
-def ExtractMapping(mapping_file=PATH_TO_MAPPING_FILE):
-    return extract_entries(mapping_file)
-
-def ConvertTagsetFromFile(mapping_file=PATH_TO_MAPPING_FILE):
-    tag_mapping=dict()
-    lines=ExtractMapping(mapping_file)
-    for line in lines:
-        tags=re.split(r"\s+",line)[:2]
-	tag_mapping[tags[0]]=tags[1]
-    return tag_mapping
-
-#def CliticMapping(infile):
-#    return ConvertTagsetFromFile(infile)
-
-#CLITIC_MAPPING=CliticMapping(PATH_TO_MAPPING_FILE2)
-
-def PickleMapping(mapping,outfile):
-    f=open(outfile,"wb")
-    dump(mapping,f,-1)
-    f.close()
-
-def BuildMappings(*infiles):
-    for f in infiles:
-        PickleMapping(ConvertTagsetFromFile(f),"%s.pkl" % f.split(".")[0])
-
-def HasClitic(cat):
-    return PRO in cat
-
-def ExtractClitic(word):
-    for k in CLITIC_MAPPING.keys():
-        # a clitic immediately follows a hyphen in word end position in enclisis or
-        # appears between hyphens in the case of mesoclisis
-	if re.search(r"-%s(-|\b)" % k, word):
-		return k
-
-def EndsInNasalDiphthong(word):
-    pattern=r".+([õã][eo]\b|m\b)".decode("utf-8")
-    if re.match(pattern,word):
-        return True
-    else:
-        return False
-
-def AppendCliticFeatures(word,cat):
-    return "%s.%s" % (cat.split(PRO)[0], CLITIC_MAPPING.get(ExtractClitic(word)))
-
-def ConcatenateFeatures(dic,feats):
-    return "+".join([dic[f] for f in list(feats)])
-
-def ConvertEntry(entry,dic):
-    # eliminating spurious hyphen in cases like abstinhas:-lhe,abster.V+PRO:I2s
-    error=":-"
-    corr="-"
-    if error in entry:
-        entry=entry.replace(error,corr)
-    parts=re.split(SEPARATOR,entry)
-    # if cat is an uninflected category like ADV
-    if len(parts) == 3:
-        word,lemma,cat=parts
-        return "%s\t%s+%s" % (word,lemma,cat)
-    # if entry is split in less than 3 or more than 4 parts,
-    # then there must be something wrong in entry (e.g. due to bad formating)
-    if len(parts) != 4:
-        print entry
-        return None
-    word,lemma,cat,feats=parts
-    # print parts
-    # if +PRO in cat, then delete +PRO and append clitic features to cat
-    if HasClitic(cat):
-        cat_list= AppendCliticFeatures(word,cat).split("/")
-        # print cat_list
-        # handling ambiguity of clitic "nos" after nasal diphthong
-        # if the clitic has two feature sets
-        if len(cat_list)==2:
-            # inserting category label into second features list
-            cat_list[1]="%s.%s" % (cat_list[0].split(".")[0],cat_list[1])
-            # if this condition is true, then clitic has two readings;
-            # else it has only the second reading
-            if EndsInNasalDiphthong(word):
-                # print word
-                entries=[]
-                for c in cat_list:
-                    entries.append("%s\t%s+%s+%s" % (word,lemma,c,ConcatenateFeatures(dic,feats)))
-                return "\n".join(entries)
-            else:
-                cat=cat_list[1]
-        else:
-            cat=cat_list[0]
-    return "%s\t%s+%s+%s" % (word,lemma,cat,ConcatenateFeatures(dic,feats))
-
-def ConvertEntriesFromFile(infile,tag_mapping=TAG_MAPPING):
-    entries=extract_entries(infile)
-    return [ConvertEntry(e,tag_mapping) for e in entries]
-    
-
-def WriteFile(entries,infile):
-    f=open("%s.pairs" % infile, "w")
-    for e in entries:
-        f.write("%s\n" % e.encode("utf-8"))
-    f.close()
-
-def main(): 
-    for file in sys.argv[1:]:
-        WriteFile(ConvertEntriesFromFile(file),file)
-
-if __name__ == '__main__':
-	main()
diff --git a/tools/python-converter/ConvertDELAF.py b/tools/python-converter/ConvertDELAF.py
new file mode 100755
index 00000000..47f2eb9d
--- /dev/null
+++ b/tools/python-converter/ConvertDELAF.py
@@ -0,0 +1,212 @@
+#! /usr/bin/env python2.7
+# -*- coding: utf-8 -*-
+
+# Author: Leonel Figueiredo de Alencar - Federal University of Ceará
+# leonel.de.alencar@ufc.br
+# Date: June 27, 2018
+
+# Usage: ConvertDELAF.py infile1 [infile_2 ... infile_n]
+
+"""Module for building word-parse pairs from DELAF-PB, converting entries in the format
+
+agendas,agenda.N:fp
+
+to MorphoBr format:
+
+agendas	agenda+N+F+PL
+
+Output files have the extension .mbr.
+Entries in the MBR format can be converted to spaced-text with the BuildSpacedText.py module. Spaced-text, in turn, compiles into lexical transducers with XFST or FOMA, using the respective spaced-text compilers.
+Tag conversions are based on the mappings defined in the dictionaries stored in the global variables TAG_MAPPING and CLITIC_MAPPING. Put these files in the directory specified in DATA_DIR or edit this variable to point to where these files are in your system. 
+
+The regular expression pattern in the global variable SEPARATOR is used for splitting
+entries into their component parts, i.e. word form, lemma, lexical category and morpho-syntactic features, e.g.:
+
+mantinham manter V I3p
+
+This separator also handles incorrectly encoded entries
+from DELAF-PB like the following, where the first colon is spurious:
+
+mantinhas:,manter.V:I2s
+
+A spurious colon in cases like abstinhas:-lhe,abster.V+PRO:I2s is eliminated by the function CorrectEntry()
+"""
+
+import re,sys
+from os.path import join,expanduser
+from cPickle import load,dump
+
+DATA_DIR=expanduser("~/morphtools_data")
+TAGS="tag_mapping"
+CLITICS="clitics"
+PATH_TO_MAPPING_FILE=join(DATA_DIR,"%s.%s" % (TAGS,"txt"))
+PATH_TO_MAPPING_FILE2=join(DATA_DIR,"%s.%s" % (CLITICS,"txt"))
+EXTENSION="mbr" # output file extension
+SEPARATOR=r"[,.:]+"
+PRO="PRO"
+
+def UnpickleMapping(infile):
+    f=open(infile,"rb")
+    dic=load(f)
+    f.close()
+    return dic
+
+TAG_MAPPING=UnpickleMapping(join(DATA_DIR,"%s.%s" % (TAGS,"pkl")))
+CLITIC_MAPPING=UnpickleMapping(join(DATA_DIR,"%s.%s" % (CLITICS,"pkl")))
+
+def OpenFile(filename):
+    return open(filename,"rU")
+
+def IgnoreLine(line):
+	return len(line.strip()) > 0 and not line.strip().startswith("#")
+
+def ExtractEntries(file):
+	return [line.strip().decode("utf-8") for line in file if IgnoreLine(line)]
+
+def ExtractMapping(filename=PATH_TO_MAPPING_FILE):
+    return ExtractEntries(OpenFile(filename))
+
+def ConvertTagsetFromFile(filename=PATH_TO_MAPPING_FILE):
+    tag_mapping=dict()
+    lines=ExtractMapping(filename)
+    for line in lines:
+        tags=re.split(r"\s+",line)[:2]
+	tag_mapping[tags[0]]=tags[1]
+    return tag_mapping
+
+def PickleMapping(mapping,outfile):
+    f=open(outfile,"wb")
+    dump(mapping,f,-1)
+    f.close()
+
+def BuildMappings(*infiles):
+    for f in infiles:
+        PickleMapping(ConvertTagsetFromFile(f),"%s.pkl" % f.split(".")[0])
+
+def HasClitic(entry):
+    return "+%s+" % PRO in entry
+
+def ExtractClitic(word):
+    for k in CLITIC_MAPPING.keys():
+        # a clitic immediately follows a hyphen in word end position in enclisis or
+        # appears between hyphens in the case of mesoclisis
+	if re.search(r"-%s(-|\b)" % k, word):
+		return k
+
+def EndsInNasalDiphthong(word):
+    pattern=r".+([õã][eo]\b|m\b)".decode("utf-8")
+    if re.match(pattern,word):
+        return True
+    else:
+        return False
+
+def AppendCliticFeatures(word,cat):
+    return "%s.%s" % (cat, CLITIC_MAPPING.get(ExtractClitic(word)))
+
+def ConcatenateFeatures(feats):
+    return "+".join(feats)
+
+def ConvertFeatures(feats,dic=TAG_MAPPING):
+    return [dic.get(f,f) for f in feats]
+
+def CorrectEntry(entry):
+    "Eliminate spurious colon in cases like abstinhas:-lhe,abster.V+PRO:I2s"
+    error=":-"
+    corr="-"
+    if error in entry:
+        entry=entry.replace(error,corr)
+    return entry
+
+def EditEntry(entry):
+    "Replace the ambiguous letter S, which represents both the superlative of adjectives and the present subjunctive of verbs, with E in the former case."
+    return re.sub(r":S([mf][sp]$)",r":E\1",entry)
+
+def ParseEntry(entry,sep=SEPARATOR):
+    return re.split(sep,EditEntry(CorrectEntry(entry)))
+
+def AnnotateClitic(word,lemma,cat,feats):
+    """Replace the PRO tag of verb forms containing an enclitic or mesoclitic pronoun with tags representing the lemma and morpho-syntactic features of that pronoun. Thus, conversions like the following are performed:
+
+degustares-lhe	degustar+V+PRO+SBJF+2+SG
+degustares-lhe	degustar+V.ele.DAT.3.SG+SBJF+2+SG
+
+The ambiguity of forms with the clitic pronoun "nos" is handled by means of the tag mapping CLITIC_MAPPING
+
+nos	ele.ACC.3.M.PL/nós.AD.1.PL
+
+Thus, an ambiguous entry like
+
+vindimam-nos	vindimar+V+PRO+PRS+3+PL
+
+is split into two separate entries
+
+vindimam-nos	vindimar+V.ele.ACC.3.M.PL+PRS+3+PL
+vindimam-nos	vindimar+V.nós.AD.1.PL+PRS+3+PL
+
+In forms with a non-ambiguous "nos", i.e. when the verb form does not end in a nasal diphthong, the clitic is mapped only to nós.AD.1.PL. Thus, an entry like
+
+vindicávamo-nos	vindicar+V+PRO+IMPF+1+PL
+
+is converted to
+
+vindicávamo-nos	vindicar+V.nós.AD.1.PL+IMPF+1+PL
+
+    """
+    # if clitic maps to "ele.ACC.3.M.PL/nós.AD.1.PL", then the length of cat_list is 2;
+    # else, it is 1
+    cat_list= AppendCliticFeatures(word,cat).split("/")
+    # if the clitic has two feature sets, then the ambiguity of clitic "nos"
+    # after nasal diphthong must be handled
+    if len(cat_list)==2:
+        # inserting category label into second list of features
+        cat_list[1]="%s.%s" % (cat_list[0].split(".")[0],cat_list[1])
+        # if this condition is true, then clitic has two readings;
+        # else it only has the second reading
+        if EndsInNasalDiphthong(word):
+            # print word
+            entries=[]
+            for c in cat_list:
+                entries.append("%s\t%s+%s+%s" % (word,lemma,c,ConcatenateFeatures(feats)))
+            return "\n".join(entries)
+        else:
+            cat=cat_list[1]
+    else:
+            cat=cat_list[0]
+    return "%s\t%s+%s+%s" % (word,lemma,cat,ConcatenateFeatures(feats))
+
+def toString(word,lemma,cat,feats):
+    return "%s\t%s+%s+%s" % (word,
+                             lemma,
+                             cat,
+                             ConcatenateFeatures(feats))
+
+def ConvertEntry(entry):
+    parts=ParseEntry(entry)
+    # if cat is an uninflected category like ADV
+    if len(parts) == 3:
+        word,lemma,cat=parts
+        return "%s\t%s+%s" % (word,lemma,cat)
+    # if entry is split into fewer than 3 or more than 4 parts,
+    # then there must be something wrong in entry (e.g. due to bad formatting)
+    if len(parts) != 4:
+        print entry
+        return None
+    word,lemma,cat,feats=parts
+    return toString(word,lemma,cat,ConvertFeatures(feats))
+
+def ConvertEntriesFromFile(filename):
+    entries=ExtractEntries(OpenFile(filename))
+    return [ConvertEntry(e) for e in entries]
+    
+def WriteFile(entries,infile):
+    f=open("%s.%s" % (infile,EXTENSION), "w")
+    for e in entries:
+        f.write("%s\n" % e.encode("utf-8"))
+    f.close()
+
+def main(): 
+    for file in sys.argv[1:]:
+        WriteFile(ConvertEntriesFromFile(file),file)
+
+if __name__ == '__main__':
+	main()

From 8465a8f236ebb637fabaffc6e71f154fbd70d859 Mon Sep 17 00:00:00 2001
From: "Leonel F. de Alencar" <leonel.de.alencar@ufc.br>
Date: Thu, 28 Jun 2018 11:58:29 -0300
Subject: [PATCH 2/3] #53 new tag mapping files

---
 tools/python-converter/tag_mapping.pkl | Bin 312 -> 328 bytes
 tools/python-converter/tag_mapping.txt |  10 +++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/python-converter/tag_mapping.pkl b/tools/python-converter/tag_mapping.pkl
index 314819a81bf893916ffa6a86257110e584a63870..bc740d5347f67f6ee506447629c5f274cdf2a2ea 100644
GIT binary patch
delta 21
ccmdnNbb@Ju54URsD+2>VaA<&Q&_wr-076~{W&i*H

delta 9
QcmX@Xw1a7a&%~HF027P^IsgCw

diff --git a/tools/python-converter/tag_mapping.txt b/tools/python-converter/tag_mapping.txt
index 0e380af0..30f6a4eb 100644
--- a/tools/python-converter/tag_mapping.txt
+++ b/tools/python-converter/tag_mapping.txt
@@ -1,9 +1,13 @@
-# this table is used for building a Python dictionary with the 
-# module BuildPairsFromDELAF
+# Conversion table between the DELAF_PB computational dictionary and the BrMorph tagsets
+
+# The ambiguous letter S, which represents both the superlative of adjectives and the present subjunctive of verbs,
+# is replaced by E in the former case.
+# For information on DELAF_PB, visit: http://www.nilc.icmc.usp.br/nilc/projects/unitex-pb/web/dicionarios.html
+# BrMorph generally follows the format proposed by Beesley and Karttunen's XFST book, using a tagset based on the Leipzig Glossing Rules and other abbreviations for morphological glosses commonly adopted in the linguistic literature: http://www.llf.cnrs.fr/sites/llf.cnrs.fr/files/statiques/Abreviations_gloses-fra.pdf
 
 D DIM	  diminutive
 A AUG	  augmentative
-S SUPER	  superlative
+E SUPER	  superlative
 m M	  masculine
 s SG	  singular
 f F	  feminine

From 1fb3897e73bcc19a0f96d161597ebb4c791c1c12 Mon Sep 17 00:00:00 2001
From: "Leonel F. de Alencar" <leonel.de.alencar@ufc.br>
Date: Mon, 2 Jul 2018 12:39:59 -0300
Subject: [PATCH 3/3] #42 #48 #53 Python module for separating clitics from
 verbs; minor changes in the other Python modules

---
 tools/prepare-delaf.sh                        |   2 +-
 .../AnnotateClitics.py                        |   9 +++--
 .../ConvertDELAF.py                           |  10 +++++
 tools/python-tools/SeparateHyphen.py          |  36 ++++++++++++++++++
 .../clitics.pkl                               | Bin
 .../clitics.txt                               |   0
 .../tag_mapping.pkl                           | Bin
 .../tag_mapping.txt                           |   0
 8 files changed, 52 insertions(+), 5 deletions(-)
 rename tools/{python-converter => python-tools}/AnnotateClitics.py (81%)
 rename tools/{python-converter => python-tools}/ConvertDELAF.py (90%)
 create mode 100755 tools/python-tools/SeparateHyphen.py
 rename tools/{python-converter => python-tools}/clitics.pkl (100%)
 rename tools/{python-converter => python-tools}/clitics.txt (100%)
 rename tools/{python-converter => python-tools}/tag_mapping.pkl (100%)
 rename tools/{python-converter => python-tools}/tag_mapping.txt (100%)

diff --git a/tools/prepare-delaf.sh b/tools/prepare-delaf.sh
index 25d72a14..5d1b8e37 100644
--- a/tools/prepare-delaf.sh
+++ b/tools/prepare-delaf.sh
@@ -30,5 +30,5 @@ grep -F ".N:" $1 | # select nouns
 grep -F ".V+PRO:" $1 | # select verbs with clitics
 # rm spurious colon like in abstinhas:-lhe,abster.V+PRO:I2s
     sed "s/:-/-/" |
-    splitW31 > delaf.clitics
+    splitW31 | SeparateHyphen.py > delaf.clitics
 
diff --git a/tools/python-converter/AnnotateClitics.py b/tools/python-tools/AnnotateClitics.py
similarity index 81%
rename from tools/python-converter/AnnotateClitics.py
rename to tools/python-tools/AnnotateClitics.py
index 7520eb31..98760637 100755
--- a/tools/python-converter/AnnotateClitics.py
+++ b/tools/python-tools/AnnotateClitics.py
@@ -3,7 +3,7 @@
 
 # Author: Leonel Figueiredo de Alencar - Federal University of Ceará
 # leonel.de.alencar@ufc.br
-# Date: June 27, 2018
+# Date: July 2, 2018
 """
 This module annotates enclitic or mesoclitic pronouns in entries in the MBR format
 
@@ -21,7 +21,7 @@
 degustares-lhe	degustar+V.ele.DAT.3.SG+SBJF+2+SG
 
 Tag conversion is performed by the AnnotateClitic function from
-the module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled. 
+module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled. 
 For more details, see the respective module documentation.
 """
 import sys
@@ -34,8 +34,9 @@ def main():
         if HasClitic(entry):
             parts=ParseEntry(entry,r"\t|\+")
             word,lemma,cat,feats=parts[0],parts[1],parts[2],parts[4:]
-            print AnnotateClitic(word,lemma,cat,feats).encode("utf-8")
+            sys.stdout.write("%s\n" % AnnotateClitic(word,lemma,cat,feats).encode("utf-8"))
         else:
-            print entry.encode("utf-8")
+            sys.stdout.write("%s\n" % entry.encode("utf-8"))
+
 if __name__ == '__main__':
 	main()
diff --git a/tools/python-converter/ConvertDELAF.py b/tools/python-tools/ConvertDELAF.py
similarity index 90%
rename from tools/python-converter/ConvertDELAF.py
rename to tools/python-tools/ConvertDELAF.py
index 47f2eb9d..78d1263b 100755
--- a/tools/python-converter/ConvertDELAF.py
+++ b/tools/python-tools/ConvertDELAF.py
@@ -44,6 +44,9 @@
 EXTENSION="mbr" # output file extension
 SEPARATOR=r"[,.:]+"
 PRO="PRO"
+# PATTERN=r"(^[^-]+)(vos|n[oa]s?|l[oa]s?|lhes?|me|se|te)(,.+\V\+PRO)"
+PATTERN1=r"(^[^-]+)(vos|n[oa]s?|l[oa]s?|lhes?|me|se|te)(,)"
+PATTERN2=r"(^[^-]+)([oa]s?)(,)"
 
 def UnpickleMapping(infile):
     f=open(infile,"rb")
@@ -109,6 +112,13 @@ def ConcatenateFeatures(feats):
 def ConvertFeatures(feats,dic=TAG_MAPPING):
     return [dic.get(f,f) for f in feats]
 
+def SeparateClitic(entry):
+    """Separate clitic from verb form in entries like abluirlhe,abluir.V+PRO:U1s,
+    returning entries like abluir-lhe,abluir.V+PRO:U1s. Clitic separation is performed in two steps: first, clitics beginning with a consonant are separated; then, clitics beginning with a vowel are separated. This is necessary to prevent unwanted separations like
+zuirn-os,zuir.V+PRO:W3s instead of zuir-nos,zuir.V+PRO:W3s, since the forms of the latter
+type of clitics are contained in those of the former."""
+    return re.sub(PATTERN2,r"\1-\2\3",re.sub(PATTERN1,r"\1-\2\3",entry))
+
 def CorrectEntry(entry):
     "Eliminate spurious colon in cases like abstinhas:-lhe,abster.V+PRO:I2s"
     error=":-"
diff --git a/tools/python-tools/SeparateHyphen.py b/tools/python-tools/SeparateHyphen.py
new file mode 100755
index 00000000..aef91574
--- /dev/null
+++ b/tools/python-tools/SeparateHyphen.py
@@ -0,0 +1,36 @@
+#! /usr/bin/env python2.7
+# -*- coding: utf-8 -*-
+
+# Author: Leonel Figueiredo de Alencar - Federal University of Ceará
+# leonel.de.alencar@ufc.br
+# Date: July 2, 2018
+
+"""
+This module corrects DELAF entries with the V+PRO tag from standard input
+by inserting the missing hyphen separating the clitic pronoun from
+the verb form in entries like the following:
+
+abluirlhe,abluir.V+PRO:U1s
+
+The output consists of corrected entries, e.g.:
+
+abluir-lhe,abluir.V+PRO:U1s
+
+Usage: cat INFILE | SeparateHyphen.py > OUTFILE
+
+The module uses the SeparateClitic function from module ConvertDELAF.
+Clitic separation is performed using PATTERN1 and PATTERN2, which presuppose
+that the input entries contain the V+PRO tag.
+"""
+
+import sys
+from ConvertDELAF import *
+
+
+def main():
+    entries=ExtractEntries(sys.stdin)
+    for entry in entries:
+        sys.stdout.write("%s\n" % SeparateClitic(entry).encode("utf-8"))
+
+if __name__ == '__main__':
+	main()
diff --git a/tools/python-converter/clitics.pkl b/tools/python-tools/clitics.pkl
similarity index 100%
rename from tools/python-converter/clitics.pkl
rename to tools/python-tools/clitics.pkl
diff --git a/tools/python-converter/clitics.txt b/tools/python-tools/clitics.txt
similarity index 100%
rename from tools/python-converter/clitics.txt
rename to tools/python-tools/clitics.txt
diff --git a/tools/python-converter/tag_mapping.pkl b/tools/python-tools/tag_mapping.pkl
similarity index 100%
rename from tools/python-converter/tag_mapping.pkl
rename to tools/python-tools/tag_mapping.pkl
diff --git a/tools/python-converter/tag_mapping.txt b/tools/python-tools/tag_mapping.txt
similarity index 100%
rename from tools/python-converter/tag_mapping.txt
rename to tools/python-tools/tag_mapping.txt