From 6c109a138117a3d4cb18f39cb5aa7505c4c71c9b Mon Sep 17 00:00:00 2001 From: "Leonel F. de Alencar" Date: Thu, 28 Jun 2018 11:01:30 -0300 Subject: [PATCH 1/3] #53 adding separate delaf=>mbr converter and clitic annotator --- tools/python-converter/AnnotateClitics.py | 41 ++++ tools/python-converter/BuildPairsFromDELAF.py | 167 -------------- tools/python-converter/ConvertDELAF.py | 212 ++++++++++++++++++ 3 files changed, 253 insertions(+), 167 deletions(-) create mode 100755 tools/python-converter/AnnotateClitics.py delete mode 100644 tools/python-converter/BuildPairsFromDELAF.py create mode 100755 tools/python-converter/ConvertDELAF.py diff --git a/tools/python-converter/AnnotateClitics.py b/tools/python-converter/AnnotateClitics.py new file mode 100755 index 00000000..7520eb31 --- /dev/null +++ b/tools/python-converter/AnnotateClitics.py @@ -0,0 +1,41 @@ +#! /usr/bin/env python2.7 +# -*- coding: utf-8 -*- + +# Author: Leonel Figueiredo de Alencar - Federal University of Ceará +# leonel.de.alencar@ufc.br +# Date: June 27, 2018 +""" +This module annotates enclitic or mesoclitic pronouns in entries in the MBR format + +Usage: cat infile.mbr | AnnotateClitics.py > outfile.mbr + +It reads entries in the MBR format from standard input and substitutes +the +PRO tag for tags representing the lemma and morpho-syntactic features +of the pronouns. The result is written to standard output. +For example, an entry like + +degustares-lhe degustar+V+PRO+INF+2+SG + +is converted to + +degustares-lhe degustar+V.ele.DAT.3.SG+SBJF+2+SG + +Tag conversion is performed by the AnnotateClitic function from +the module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled. +For more details, see the respective module documentation. 
+""" +import sys +from ConvertDELAF import * + + +def main(): + entries=ExtractEntries(sys.stdin) + for entry in entries: + if HasClitic(entry): + parts=ParseEntry(entry,r"\t|\+") + word,lemma,cat,feats=parts[0],parts[1],parts[2],parts[4:] + print AnnotateClitic(word,lemma,cat,feats).encode("utf-8") + else: + print entry.encode("utf-8") +if __name__ == '__main__': + main() diff --git a/tools/python-converter/BuildPairsFromDELAF.py b/tools/python-converter/BuildPairsFromDELAF.py deleted file mode 100644 index b80dd04a..00000000 --- a/tools/python-converter/BuildPairsFromDELAF.py +++ /dev/null @@ -1,167 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- - -# Author: Leonel Figueiredo de Alencar -# leonel.de.alencar@ufc.br -# Date: Mai 8, 2018 - -# Usage: python BuildPairsFromDELAF.py infile1 [infile2 ... infilen] - -""""Module for building word-parse pairs from DELAF-PB, converting entries in the format - -agendas,agenda.N:fp - -to an intermediate format used to compile lexical transducers with XFST or FOMA, using the respective -spaced-text compilers: - -agendas agenda+N+F+PL - -The module also substitutes the PRO tag in verb forms with an enclitic or mesoclitic pronoun for tags -representing the lemma and morpho-syntactic features of these pronouns. Thus, convertions like the following -are performed: - -degustares-lhe,degustar.V+PRO:W2s -degustares-lhe degustar+V.ele.DAT.3.SG+SBJF+2+SG - -Entries in this intermediate format, in turn, can be converted to spaced-text using the BuildSpacedText.py module. -The tag convertions are based on the mappings defined in the dictionaries stored in the global variables TAG_MAPPING and CLITIC_MAPPING. -The regular expression pattern in the global variable SEPARATOR is used for splitting -entries in its component parts, i.e. 
word form, lemma, lexical category and morpho-syntactic features: - -mantinham manter V I3p - -This separator also handles incorrectly encoded entries -from DELAF-PB like the following, where the first colon is spurious: - -mantinhas:,manter.V:I2s - -A spurious hyphen in cases like abstinhas:-lhe,abster.V+PRO:I2s is eliminated by the function ConvertEntry() -""" - -import re,sys,os -from cPickle import load,dump -from BuildSpacedText import extract_entries - -SEPARATOR=r"[,.:]+" -# edit paths to point to your local files -PATH_TO_MAPPING_FILE=os.path.expanduser("~/morphtools/tag_mapping.txt") -PATH_TO_MAPPING_FILE2=os.path.expanduser("~/morphtools/clitics.txt") - -PRO="+PRO" - -def UnpickleMapping(infile): - f=open(infile,"rb") - dic=load(f) - f.close() - return dic - -# edit paths to point to your local files -TAG_MAPPING=UnpickleMapping(os.path.expanduser("~/morphtools_data/tag_mapping.pkl")) -CLITIC_MAPPING=UnpickleMapping(os.path.expanduser("~/morphtools_data/clitics.pkl")) - -def ExtractMapping(mapping_file=PATH_TO_MAPPING_FILE): - return extract_entries(mapping_file) - -def ConvertTagsetFromFile(mapping_file=PATH_TO_MAPPING_FILE): - tag_mapping=dict() - lines=ExtractMapping(mapping_file) - for line in lines: - tags=re.split(r"\s+",line)[:2] - tag_mapping[tags[0]]=tags[1] - return tag_mapping - -#def CliticMapping(infile): -# return ConvertTagsetFromFile(infile) - -#CLITIC_MAPPING=CliticMapping(PATH_TO_MAPPING_FILE2) - -def PickleMapping(mapping,outfile): - f=open(outfile,"wb") - dump(mapping,f,-1) - f.close() - -def BuildMappings(*infiles): - for f in infiles: - PickleMapping(ConvertTagsetFromFile(f),"%s.pkl" % f.split(".")[0]) - -def HasClitic(cat): - return PRO in cat - -def ExtractClitic(word): - for k in CLITIC_MAPPING.keys(): - # a clitic immediately follows a hyphen in word end position in enclisis or - # appears between hyphens in the case of mesoclisis - if re.search(r"-%s(-|\b)" % k, word): - return k - -def EndsInNasalDiphthong(word): - 
pattern=r".+([õã][eo]\b|m\b)".decode("utf-8") - if re.match(pattern,word): - return True - else: - return False - -def AppendCliticFeatures(word,cat): - return "%s.%s" % (cat.split(PRO)[0], CLITIC_MAPPING.get(ExtractClitic(word))) - -def ConcatenateFeatures(dic,feats): - return "+".join([dic[f] for f in list(feats)]) - -def ConvertEntry(entry,dic): - # eliminating spurious hyphen in cases like abstinhas:-lhe,abster.V+PRO:I2s - error=":-" - corr="-" - if error in entry: - entry=entry.replace(error,corr) - parts=re.split(SEPARATOR,entry) - # if cat is an uninflected category like ADV - if len(parts) == 3: - word,lemma,cat=parts - return "%s\t%s+%s" % (word,lemma,cat) - # if entry is split in less than 3 or more than 4 parts, - # then there must be something wrong in entry (e.g. due to bad formating) - if len(parts) != 4: - print entry - return None - word,lemma,cat,feats=parts - # print parts - # if +PRO in cat, then delete +PRO and append clitic features to cat - if HasClitic(cat): - cat_list= AppendCliticFeatures(word,cat).split("/") - # print cat_list - # handling ambiguity of clitic "nos" after nasal diphthong - # if the clitic has two feature sets - if len(cat_list)==2: - # inserting category label into second features list - cat_list[1]="%s.%s" % (cat_list[0].split(".")[0],cat_list[1]) - # if this condition is true, then clitic has two readings; - # else it has only the second reading - if EndsInNasalDiphthong(word): - # print word - entries=[] - for c in cat_list: - entries.append("%s\t%s+%s+%s" % (word,lemma,c,ConcatenateFeatures(dic,feats))) - return "\n".join(entries) - else: - cat=cat_list[1] - else: - cat=cat_list[0] - return "%s\t%s+%s+%s" % (word,lemma,cat,ConcatenateFeatures(dic,feats)) - -def ConvertEntriesFromFile(infile,tag_mapping=TAG_MAPPING): - entries=extract_entries(infile) - return [ConvertEntry(e,tag_mapping) for e in entries] - - -def WriteFile(entries,infile): - f=open("%s.pairs" % infile, "w") - for e in entries: - f.write("%s\n" % 
e.encode("utf-8")) - f.close() - -def main(): - for file in sys.argv[1:]: - WriteFile(ConvertEntriesFromFile(file),file) - -if __name__ == '__main__': - main() diff --git a/tools/python-converter/ConvertDELAF.py b/tools/python-converter/ConvertDELAF.py new file mode 100755 index 00000000..47f2eb9d --- /dev/null +++ b/tools/python-converter/ConvertDELAF.py @@ -0,0 +1,212 @@ +#! /usr/bin/env python2.7 +# -*- coding: utf-8 -*- + +# Author: Leonel Figueiredo de Alencar - Federal University of Ceará +# leonel.de.alencar@ufc.br +# Date: June 27, 2018 + +# Usage: ConvertDELAF.py infile1 [infile_2 ... infile_n] + +""""Module for building word-parse pairs from DELAF-PB, converting entries in the format + +agendas,agenda.N:fp + +to MorphoBr format: + +agendas agenda+N+F+PL + +Output files have the extension .mbr. +Entries in the MBR format can be converted to spaced-text with the BuildSpacedText.py module. Spaced-text, in turn, compiles into lexical transducers with XFST or FOMA, using the respective spaced-text compilers. +Tag conversions are based on the mappings defined in the dictionaries stored in the global variables TAG_MAPPING and CLITIC_MAPPING. These dictionaries are loaded from the pickled files tag_mapping.pkl and clitics.pkl; put these files in the directory specified in DATA_DIR or edit this variable to point to where these files are on your system. + +The regular expression pattern in the global variable SEPARATOR is used for splitting +entries into their component parts, i.e. 
word form, lemma, lexical category and morpho-syntactic features, e.g.: + +mantinham manter V I3p + +This separator also handles incorrectly encoded entries +from DELAF-PB like the following, where the first colon is spurious: + +mantinhas:,manter.V:I2s + +A spurious colon in cases like abstinhas:-lhe,abster.V+PRO:I2s is eliminated by the function CorrectEntry() +""" + +import re,sys +from os.path import join,expanduser +from cPickle import load,dump + +DATA_DIR=expanduser("~/morphtools_data") +TAGS="tag_mapping" +CLITICS="clitics" +PATH_TO_MAPPING_FILE=join(DATA_DIR,"%s.%s" % (TAGS,"txt")) +PATH_TO_MAPPING_FILE2=join(DATA_DIR,"%s.%s" % (CLITICS,"txt")) +EXTENSION="mbr" # output file extension +SEPARATOR=r"[,.:]+" +PRO="PRO" + +def UnpickleMapping(infile): + f=open(infile,"rb") + dic=load(f) + f.close() + return dic + +TAG_MAPPING=UnpickleMapping(join(DATA_DIR,"%s.%s" % (TAGS,"pkl"))) +CLITIC_MAPPING=UnpickleMapping(join(DATA_DIR,"%s.%s" % (CLITICS,"pkl"))) + +def OpenFile(filename): + return open(filename,"rU") + +def IgnoreLine(line): + return len(line.strip()) > 0 and not line.strip().startswith("#") + +def ExtractEntries(file): + return [line.strip().decode("utf-8") for line in file if IgnoreLine(line)] + +def ExtractMapping(filename=PATH_TO_MAPPING_FILE): + return ExtractEntries(OpenFile(filename)) + +def ConvertTagsetFromFile(filename=PATH_TO_MAPPING_FILE): + tag_mapping=dict() + lines=ExtractMapping(filename) + for line in lines: + tags=re.split(r"\s+",line)[:2] + tag_mapping[tags[0]]=tags[1] + return tag_mapping + +def PickleMapping(mapping,outfile): + f=open(outfile,"wb") + dump(mapping,f,-1) + f.close() + +def BuildMappings(*infiles): + for f in infiles: + PickleMapping(ConvertTagsetFromFile(f),"%s.pkl" % f.split(".")[0]) + +def HasClitic(entry): + return "+%s+" % PRO in entry + +def ExtractClitic(word): + for k in CLITIC_MAPPING.keys(): + # a clitic immediately follows a hyphen in word end position in enclisis or + # appears between hyphens in the case 
of mesoclisis + if re.search(r"-%s(-|\b)" % k, word): + return k + +def EndsInNasalDiphthong(word): + pattern=r".+([õã][eo]\b|m\b)".decode("utf-8") + if re.match(pattern,word): + return True + else: + return False + +def AppendCliticFeatures(word,cat): + return "%s.%s" % (cat, CLITIC_MAPPING.get(ExtractClitic(word))) + +def ConcatenateFeatures(feats): + return "+".join(feats) + +def ConvertFeatures(feats,dic=TAG_MAPPING): + return [dic.get(f,f) for f in feats] + +def CorrectEntry(entry): + "Eliminate spurious colon in cases like abstinhas:-lhe,abster.V+PRO:I2s" + error=":-" + corr="-" + if error in entry: + entry=entry.replace(error,corr) + return entry + +def EditEntry(entry): + "Replace the ambiguous letter S, which represents both the superlative of adjectives and the present subjunctive of verbs, with E in the former case." + return re.sub(r":S([mf][sp]$)",r":E\1",entry) + +def ParseEntry(entry,sep=SEPARATOR): + return re.split(sep,EditEntry(CorrectEntry(entry))) + +def AnnotateClitic(word,lemma,cat,feats): + """Substitutes the PRO tag in verb forms with an enclitic or mesoclitic pronoun for tags representing the lemma and morpho-syntactic features of these pronouns. The verb features are passed through unchanged. Thus, conversions like the following are performed: + +degustares-lhe degustar+V+PRO+INF+2+SG +degustares-lhe degustar+V.ele.DAT.3.SG+INF+2+SG + +The ambiguity of forms with the clitic pronoun "nos" is handled by means of the tag mapping CLITIC_MAPPING + +nos ele.ACC.3.M.PL/nós.AD.1.PL + +Thus, an ambiguous entry like + +vindimam-nos vindimar+V+PRO+PRS+3+PL + +is split into two separate entries + +vindimam-nos vindimar+V.ele.ACC.3.M.PL+PRS+3+PL +vindimam-nos vindimar+V.nós.AD.1.PL+PRS+3+PL + +In forms with a non-ambiguous "nos", i.e. when the verb form does not end in a nasal diphthong, the clitic is mapped to nós.AD.1.PL only. 
Thus, an entry like + +vindicávamo-nos vindicar+V+PRO+IMPF+1+PL + +is converted to + +vindicávamo-nos vindicar+V.nós.AD.1.PL+IMPF+1+PL + + """ + # if clitic maps to "ele.ACC.3.M.PL/nós.AD.1.PL", then the length of cat_list is 2; + # else, it is 1 + cat_list= AppendCliticFeatures(word,cat).split("/") + # if the clitic has two feature sets, then the ambiguity of clitic "nos" + # after nasal diphthong must be handled + if len(cat_list)==2: + # inserting category label into second list of features + cat_list[1]="%s.%s" % (cat_list[0].split(".")[0],cat_list[1]) + # if this condition is true, then clitic has two readings; + # else it only has the second reading + if EndsInNasalDiphthong(word): + # print word + entries=[] + for c in cat_list: + entries.append("%s\t%s+%s+%s" % (word,lemma,c,ConcatenateFeatures(feats))) + return "\n".join(entries) + else: + cat=cat_list[1] + else: + cat=cat_list[0] + return "%s\t%s+%s+%s" % (word,lemma,cat,ConcatenateFeatures(feats)) + +def toString(word,lemma,cat,feats): + return "%s\t%s+%s+%s" % (word, + lemma, + cat, + ConcatenateFeatures(feats)) + +def ConvertEntry(entry): + parts=ParseEntry(entry) + # if cat is an uninflected category like ADV + if len(parts) == 3: + word,lemma,cat=parts + return "%s\t%s+%s" % (word,lemma,cat) + # if entry is split in less than 3 or more than 4 parts, + # then there must be something wrong in entry (e.g. 
due to bad formatting) + if len(parts) != 4: + print entry + return None + word,lemma,cat,feats=parts + return toString(word,lemma,cat,ConvertFeatures(feats)) + +def ConvertEntriesFromFile(filename): + entries=ExtractEntries(OpenFile(filename)) + return [ConvertEntry(e) for e in entries] + +def WriteFile(entries,infile): + f=open("%s.%s" % (infile,EXTENSION), "w") + for e in entries: + f.write("%s\n" % e.encode("utf-8")) + f.close() + +def main(): + for file in sys.argv[1:]: + WriteFile(ConvertEntriesFromFile(file),file) + +if __name__ == '__main__': + main() From 8465a8f236ebb637fabaffc6e71f154fbd70d859 Mon Sep 17 00:00:00 2001 From: "Leonel F. de Alencar" Date: Thu, 28 Jun 2018 11:58:29 -0300 Subject: [PATCH 2/3] #53 new tag mapping files --- tools/python-converter/tag_mapping.pkl | Bin 312 -> 328 bytes tools/python-converter/tag_mapping.txt | 10 +++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/python-converter/tag_mapping.pkl b/tools/python-converter/tag_mapping.pkl index 314819a81bf893916ffa6a86257110e584a63870..bc740d5347f67f6ee506447629c5f274cdf2a2ea 100644 GIT binary patch delta 21 ccmdnNbb@Ju54URsD+2>VaA<&Q&_wr-076~{W&i*H delta 9 QcmX@Xw1a7a&%~HF027P^IsgCw diff --git a/tools/python-converter/tag_mapping.txt b/tools/python-converter/tag_mapping.txt index 0e380af0..30f6a4eb 100644 --- a/tools/python-converter/tag_mapping.txt +++ b/tools/python-converter/tag_mapping.txt @@ -1,9 +1,13 @@ -# this table is used for building a Python dictionary with the -# module BuildPairsFromDELAF +# Conversion table between the DELAF_PB computational dictionary and the BrMorph tagsets + +# Ambiguous letter S, which represents both the superlative of adjectives and the present subjunctive of verbs, +# is substituted for E in the former case. 
+# For information on DELAF_PB, visit: http://www.nilc.icmc.usp.br/nilc/projects/unitex-pb/web/dicionarios.html +# BrMorph generally follows the format proposed by Beesley and Karttunen's XFST book, using a tagset based on the Leipzig Glossing Rules and other abbreviations for morphological glosses commonly adopted in the linguistic literature: http://www.llf.cnrs.fr/sites/llf.cnrs.fr/files/statiques/Abreviations_gloses-fra.pdf D DIM diminutive A AUG augmentative -S SUPER superlative +E SUPER superlative m M masculine s SG singular f F feminine From 1fb3897e73bcc19a0f96d161597ebb4c791c1c12 Mon Sep 17 00:00:00 2001 From: "Leonel F. de Alencar" Date: Mon, 2 Jul 2018 12:39:59 -0300 Subject: [PATCH 3/3] #42 #48 #53 Python module for separating clitics from verbs; minor changes in the other Python modules --- tools/prepare-delaf.sh | 2 +- .../AnnotateClitics.py | 9 +++-- .../ConvertDELAF.py | 10 +++++ tools/python-tools/SeparateHyphen.py | 36 ++++++++++++++++++ .../clitics.pkl | Bin .../clitics.txt | 0 .../tag_mapping.pkl | Bin .../tag_mapping.txt | 0 8 files changed, 52 insertions(+), 5 deletions(-) rename tools/{python-converter => python-tools}/AnnotateClitics.py (81%) rename tools/{python-converter => python-tools}/ConvertDELAF.py (90%) create mode 100755 tools/python-tools/SeparateHyphen.py rename tools/{python-converter => python-tools}/clitics.pkl (100%) rename tools/{python-converter => python-tools}/clitics.txt (100%) rename tools/{python-converter => python-tools}/tag_mapping.pkl (100%) rename tools/{python-converter => python-tools}/tag_mapping.txt (100%) diff --git a/tools/prepare-delaf.sh b/tools/prepare-delaf.sh index 25d72a14..5d1b8e37 100644 --- a/tools/prepare-delaf.sh +++ b/tools/prepare-delaf.sh @@ -30,5 +30,5 @@ grep -F ".N:" $1 | # select nouns grep -F ".V+PRO:" $1 | # select verbs with clitics # rm spurious colon like in abstinhas:-lhe,abster.V+PRO:I2s sed "s/:-/-/" | - splitW31 > delaf.clitics + splitW31 | SeparateHyphen.py > delaf.clitics diff 
--git a/tools/python-converter/AnnotateClitics.py b/tools/python-tools/AnnotateClitics.py similarity index 81% rename from tools/python-converter/AnnotateClitics.py rename to tools/python-tools/AnnotateClitics.py index 7520eb31..98760637 100755 --- a/tools/python-converter/AnnotateClitics.py +++ b/tools/python-tools/AnnotateClitics.py @@ -3,7 +3,7 @@ # Author: Leonel Figueiredo de Alencar - Federal University of Ceará # leonel.de.alencar@ufc.br -# Date: June 27, 2018 +# Date: July 2, 2018 """ This module annotates enclitic or mesoclitic pronouns in entries in the MBR format @@ -21,7 +21,7 @@ degustares-lhe degustar+V.ele.DAT.3.SG+SBJF+2+SG Tag conversion is performed by the AnnotateClitic function from -the module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled. +module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled. For more details, see the respective module documentation. """ import sys @@ -34,8 +34,9 @@ def main(): if HasClitic(entry): parts=ParseEntry(entry,r"\t|\+") word,lemma,cat,feats=parts[0],parts[1],parts[2],parts[4:] - print AnnotateClitic(word,lemma,cat,feats).encode("utf-8") + sys.stdout.write("%s\n" % AnnotateClitic(word,lemma,cat,feats).encode("utf-8")) else: - print entry.encode("utf-8") + sys.stdout.write("%s\n" % entry.encode("utf-8")) + if __name__ == '__main__': main() diff --git a/tools/python-converter/ConvertDELAF.py b/tools/python-tools/ConvertDELAF.py similarity index 90% rename from tools/python-converter/ConvertDELAF.py rename to tools/python-tools/ConvertDELAF.py index 47f2eb9d..78d1263b 100755 --- a/tools/python-converter/ConvertDELAF.py +++ b/tools/python-tools/ConvertDELAF.py @@ -44,6 +44,9 @@ EXTENSION="mbr" # output file extension SEPARATOR=r"[,.:]+" PRO="PRO" +# PATTERN=r"(^[^-]+)(vos|n[oa]s?|l[oa]s?|lhes?|me|se|te)(,.+\V\+PRO)" +PATTERN1=r"(^[^-]+)(vos|n[oa]s?|l[oa]s?|lhes?|me|se|te)(,)" +PATTERN2=r"(^[^-]+)([oa]s?)(,)" def UnpickleMapping(infile): f=open(infile,"rb") @@ -109,6 +112,13 @@ def 
ConcatenateFeatures(feats): def ConvertFeatures(feats,dic=TAG_MAPPING): return [dic.get(f,f) for f in feats] +def SeparateClitic(entry): + """Separate clitic from verb form in entries like abluirlhe,abluir.V+PRO:U1s, + returning entries like abluir-lhe,abluir.V+PRO:U1s. Clitic separation is performed in two steps: first, clitics beginning with a consonant are separated (PATTERN1); then, clitics beginning with a vowel are separated (PATTERN2). This order is necessary to prevent unwanted separations like +zuirn-os,zuir.V+PRO:W3s instead of zuir-nos,zuir.V+PRO:W3s, since the forms of the vowel-initial +clitics are contained in those of the consonant-initial ones.""" + return re.sub(PATTERN2,r"\1-\2\3",re.sub(PATTERN1,r"\1-\2\3",entry)) + def CorrectEntry(entry): "Eliminate spurious colon in cases like abstinhas:-lhe,abster.V+PRO:I2s" error=":-" corr="-" diff --git a/tools/python-tools/SeparateHyphen.py b/tools/python-tools/SeparateHyphen.py new file mode 100755 index 00000000..aef91574 --- /dev/null +++ b/tools/python-tools/SeparateHyphen.py @@ -0,0 +1,36 @@ +#! /usr/bin/env python2.7 +# -*- coding: utf-8 -*- + +# Author: Leonel Figueiredo de Alencar - Federal University of Ceará +# leonel.de.alencar@ufc.br +# Date: July 2, 2018 + +""" +This module corrects DELAF entries with the V+PRO tag read from standard input +by inserting the missing hyphen separating the clitic pronoun from +the verb form in entries like the following: + +abluirlhe,abluir.V+PRO:U1s + +The output consists of corrected entries, e.g.: + +abluir-lhe,abluir.V+PRO:U1s + +Usage: cat INFILE | SeparateHyphen.py > OUTFILE + +The module uses the SeparateClitic function from module ConvertDELAF. +Clitic separation is performed using PATTERN1 and PATTERN2, which presuppose that the +input entries contain the V+PRO tag. 
+""" + +import sys +from ConvertDELAF import * + + +def main(): + entries=ExtractEntries(sys.stdin) + for entry in entries: + sys.stdout.write("%s\n" % SeparateClitic(entry).encode("utf-8")) + +if __name__ == '__main__': + main() diff --git a/tools/python-converter/clitics.pkl b/tools/python-tools/clitics.pkl similarity index 100% rename from tools/python-converter/clitics.pkl rename to tools/python-tools/clitics.pkl diff --git a/tools/python-converter/clitics.txt b/tools/python-tools/clitics.txt similarity index 100% rename from tools/python-converter/clitics.txt rename to tools/python-tools/clitics.txt diff --git a/tools/python-converter/tag_mapping.pkl b/tools/python-tools/tag_mapping.pkl similarity index 100% rename from tools/python-converter/tag_mapping.pkl rename to tools/python-tools/tag_mapping.pkl diff --git a/tools/python-converter/tag_mapping.txt b/tools/python-tools/tag_mapping.txt similarity index 100% rename from tools/python-converter/tag_mapping.txt rename to tools/python-tools/tag_mapping.txt