forked from DivinumOfficium/divinum-officium
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranscribe.pl
114 lines (97 loc) · 3.25 KB
/
transcribe.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/perl -CO
use utf8;
use warnings;
use strict;
use FindBin;
use Encode;
$\ = "\n";
# The format of the accent table is an unordered set of lines of the form
# plaintextword accentedtextword
# with exactly one range of whitespace in between.
# The program looks up words from the left and replaces them with words from the right.
# If there's no match, but there is a match after downcases the initial letter of the
# source word, then the replacement is done and then its initial is upcased.
# This program works in UTF-8 only.
my $Bin = $FindBin::Bin;
my @accents;
open ACCENTS, '<:encoding(utf-8)', "$Bin/accent_table"
or die "Can't read $Bin/accent_table\n";
my %table;
{ local $/; %table = split(' ', <ACCENTS>); }
close ACCENTS;
my $convert = Encode::find_encoding('utf-8');
my $rule;
my $rank;
while ( my $line = <> )
{
chomp $line;
eval { $line = $convert->decode($line, Encode::FB_CROAK); 1 }
or die "transcribe: input not UTF-8 on line $.\n";
unless ( $rule || $rank || $line =~ /^ *[!&#\$\@\[]/ )
{
# Only transcribe the suffix text, not the prefix rules, whatever they are.
next unless $line =~ /^([^=]*=|.*{ *:[^{}]*})?([^={}]*)$/;
my $prefix = $1 ? $1 : '';
my $words = $2;
my @words = split(/([^\pL]+)/, $words);
my $n = 0;
for my $word ( @words )
{
# First word in some lines is special but unmarked.
next if $n == 0 && $word eq 'Benedictio';
next if $n == 0 && $word eq 'Absolutio';
next if $n == 0 && $word eq 'Antiphona';
# Denormalize accents but not case.
# This handles the difference between María and mária.
$word =~ tr/áéíóúÁÉÍÓÚ/aeiouAEIOU/;
$word =~ s/[æǽ]/ae/g;
$word =~ s/[ÆǼ]/Ae/g;
$word =~ s/œ/oe/g;
$word =~ s/Œ/Oe/g;
# Try case specific first.
if ( $table{$word} )
{
$word = $table{$word}
}
else
{
my $replacement = $word;
$replacement =~ tr/A-Z/a-z/;
my $lowered = $replacement ne $word;
$replacement = $table{$replacement};
if ( $replacement )
{
if ( $lowered )
{
my $a1 = substr($replacement,0,1);
$a1 =~ tr/a-záéíóúǽæ/A-ZÁÉÍÓÚǼÆ/;
$replacement = $a1 . substr($replacement,1);
}
$word = $replacement;
}
}
}
continue
{
$n = $n + 1
}
$line = join('', @words);
# The following are more often right than wrong, but sometimes wrong,
# since coeptus is coëptus, and aerus is aërus.
# Corrections should do in the accents_table.
$line =~ s/ae/æ/g;
$line =~ s/Ae/Æ/g;
$line =~ s/oe/œ/g;
$line =~ s/Oe/Œ/g;
$line = $prefix. $line;
}
else
{
$rule = ($line =~ /\[Rule\]/) || ($rule && $line !~ /^\[/);
$rank = ($line =~ /\[Rank\]/) || ($rank && $line !~ /^\[/);
}
}
continue
{
print $line;
}