Skip to content

Commit

Permalink
(#53) merge Leonel's work
Browse files Browse the repository at this point in the history
  • Loading branch information
odanoburu committed Jul 3, 2018
2 parents 54f1bd0 + 1fb3897 commit 0cc197e
Show file tree
Hide file tree
Showing 12 changed files with 338 additions and 198 deletions.
9 changes: 6 additions & 3 deletions tools/bootstrap/prepare-delaf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# tr -d '\r' < Delaf2015v04.dic > delaf.dic

function splitW31 {
# Filter (stdin -> stdout): split entries that end in W31 but should be
# two entries, one ending in W3s and the other ending in W1s.
# The doubled backslashes are shell escapes inside the double quotes, so
# sed receives  s/\(.*W\)31$/\13s\n\11s/  -- that is, back-reference \1
# followed by the literal "3s", a line break, then \1 followed by "1s".
# NOTE(review): "\n" producing a newline in the replacement is assumed to
# rely on GNU sed -- confirm before running with another sed.
sed "s/\\(.*W\\)31$/\\13s\\n\\11s/"
}

Expand All @@ -28,6 +30,7 @@ grep -F ".N:" "$1" | # select nouns

# verbs with clitics
grep -F ".V+PRO:" "$1" | # select verbs with clitics
# rm spurious colon like in abstinhas:-lhe,abster.V+PRO:I2s
sed "s/:-/-/" |
splitW31 > delaf.clitics
sed "s/:-/-/" | # rm spurious colon like in abstinhas:-lhe,abster.V+PRO:I2s
splitW31 |
# add the hyphen where it should exist, e.g. amarnos -> amar-nos
SeparateHyphen.py > delaf.clitics
2 changes: 1 addition & 1 deletion tools/convert-format/MorphoMbr.gf
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ concrete MorphoMbr of Morpho = open Prelude, Predef in {
mkEntry fo l fs = ss (fo.s ++ "&t" ++ l.s ++ fs.s) ;

clitic : Str ;
clitic = personstr ++ mode ++ optStr personnum ++ optStr gender ++ optStr number ;`
clitic = personstr ++ mode ++ optStr personnum ++ optStr gender ++ optStr number ;

personstr, mode, personnum, gender, number : Str ;
personstr = "." ++ ("ele" | "vós" | "nós" | "eu" | "tu") ;
Expand Down
167 changes: 0 additions & 167 deletions tools/python-converter/BuildPairsFromDELAF.py

This file was deleted.

Binary file removed tools/python-converter/tag_mapping.pkl
Binary file not shown.
27 changes: 0 additions & 27 deletions tools/python-converter/tag_mapping.txt

This file was deleted.

42 changes: 42 additions & 0 deletions tools/python-tools/AnnotateClitics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-

# Author: Leonel Figueiredo de Alencar - Federal University of Ceará
# [email protected]
# Date: July 2, 2018
"""
This module annotates enclitic or mesoclitic pronouns in entries in the MBR format
Usage: cat infile.mbr | AnnotateClitics.py > outfile.mbr
It reads entries in the MBR format from standard input and replaces
the +PRO tag with tags representing the lemma and morpho-syntactic features
of the pronouns. The result is written to standard output.
For example, an entry like
degustares-lhe degustar+V+PRO+INF+2+SG
is converted to
degustares-lhe degustar+V.ele.DAT.3.SG+SBJF+2+SG
Tag conversion is performed by the AnnotateClitic function from
module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled.
For more details, see the respective module documentation.
"""
import sys
from ConvertDELAF import *


def main():
    """Copy MBR entries from stdin to stdout, annotating those with clitics.

    Entries for which HasClitic() is false are written back unchanged.
    For the others, ParseEntry() is called with a tab-or-plus separator
    pattern; fields 0, 1 and 2 are used as word, lemma and category,
    field 3 is not forwarded (in the module docstring's example it is the
    PRO tag), and the remaining fields are handed to AnnotateClitic() as
    the features. Every line written is encoded as UTF-8.
    """
    for entry in ExtractEntries(sys.stdin):
        if not HasClitic(entry):
            # Nothing to annotate: echo the entry as it came in.
            sys.stdout.write("%s\n" % entry.encode("utf-8"))
            continue
        fields = ParseEntry(entry, r"\t|\+")
        word, lemma, cat = fields[0], fields[1], fields[2]
        # fields[3] is skipped; only what follows it counts as features.
        feats = fields[4:]
        annotated = AnnotateClitic(word, lemma, cat, feats)
        sys.stdout.write("%s\n" % annotated.encode("utf-8"))


if __name__ == '__main__':
    main()
Loading

0 comments on commit 0cc197e

Please sign in to comment.