diff --git a/tools/prepare-delaf.sh b/tools/prepare-delaf.sh index 271c2174..25d72a14 100644 --- a/tools/prepare-delaf.sh +++ b/tools/prepare-delaf.sh @@ -1,7 +1,13 @@ #!/bin/bash +# first argument is path to delaf dictionary + # don't forget to change CRLF to LF! # tr -d '\r' < Delaf2015v04.dic > delaf.dic +function splitW31 { + sed "s/\(.*W\)31$/\13s\n\11s/" +} + # adjectives grep -F ".A:" $1 > delaf.adj @@ -13,13 +19,16 @@ grep -F ".N:" $1 | # select nouns #remove uppercase lemmas (Gloria) grep -v ",[A-Z]" > delaf.nouns -# simple verbs -grep -F ".V:" $1 | # select simple verbs - sed "s/:,/,/" | # rm entries like mantinhas:,manter.V:I2s -# sorri,sorrir.V:Y2S -> sorri,sorrir.V:Y2s - sed "s/2S$/2s/" > delaf.verbs +# # simple verbs + grep -F ".V:" $1 | # select simple verbs + sed "s/:,/,/" | # rm entries like mantinhas:,manter.V:I2s +# # sorri,sorrir.V:Y2S -> sorri,sorrir.V:Y2s + sed "s/2S$/2s/" | # split entries like abstrair,abstrair.V:W31 into a ...W3s line and a ...W1s line + splitW31 > delaf.verbs # verbs with clitics grep -F ".V+PRO:" $1 | # select verbs with clitics # rm spurious colon like in abstinhas:-lhe,abster.V+PRO:I2s - sed "s/:-/-/" > delaf.clitics + sed "s/:-/-/" | + splitW31 > delaf.clitics + diff --git a/tools/prepare-freeling.sh b/tools/prepare-freeling.sh new file mode 100644 index 00000000..af908a30 --- /dev/null +++ b/tools/prepare-freeling.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# first argument is path to directory where freeling dictionary files +# are, with their original names; NOTE(review): $1 is never read below, the files are taken from the current directory + +# adjectives +# change C as diminutive tag to D (as is in nouns) +sed "s/ AQC/ AQD/" adjs > fl.adjectives + +# adverbs +mv adv fl.adverbs + +# nouns +# correct nouns with wrong C tag such as habeas-corpus +sed "s/ NCMC/ NCMN/" nouns > fl.nouns + +# verbs +mv verbs fl.verb diff --git a/tools/upstream-problems.org b/tools/upstream-problems.org index 3232338d..c6f1d527 100644 --- a/tools/upstream-problems.org +++ b/tools/upstream-problems.org @@ -25,7 +25,8 @@ subjuntive instead of singular number: : 
sorri,sorrir.V:Y2S - several hundred forms of infinitive had been marked as having two - persons, where these should have been in two lines instead: + persons, where these should have been in two lines instead, and + should have a number tag: : abstrair,abstrair.V:W31 - several hundred forms missing hifen: : protrairnos protrair+V.None+SBJF+1+SG