Skip to content

Commit 65d745c

Browse files
committed
Update the abbreviations to work off the ISSN LTWA 2016-09-15 CSV file
1 parent d689043 commit 65d745c

7 files changed

+1915
-1117
lines changed

download-abbrv.sh

+2-6
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,15 @@
44
## Pass "redownload" as the first argument to force a fresh download of the LTWA, i.e.:
55
## ./download-abbrv.sh redownload
66

7+
LTWA_URL="https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt";
78
ISSN_LANGUAGES=("mul" "eng" "ger" "fre" "spa");
89
UNIX_LANGUAGES=("all" "en" "de" "fr" "es" );
910
MAXLANG=$((${#ISSN_LANGUAGES[@]}-1));
1011

1112
if [ ! -f lang_data.txt ] || [ "$1" = "redownload" ]; then
1213
echo "Downloading LTWA Database...";
1314
rm lang_data.txt 2> /dev/null;
14-
for LETTER in `echo {a..z}`; do
15-
ENTRIES=`wget -O - http://www.issn.org/2-22661-LTWA-online.php?letter=${LETTER} | grep '<tr><td>' | grep -v 'n.a.' | sed 's/\///g' | sed 's/<tr>//g' | sed 's/<td><td>/<td>/g' | sed 's/,\ /,/g' | sed 's/\ /\&nbsp;/g'`;
16-
echo "${ENTRIES}" >> lang_data.txt;
17-
done
15+
wget --no-check-certificate -O - "${LTWA_URL}" | iconv -f UTF-16LE -t UTF-8 > lang_data.txt;
1816
fi
1917

2018
for I in `seq 0 ${MAXLANG}`; do
@@ -23,5 +21,3 @@ for I in `seq 0 ${MAXLANG}`; do
2321
echo "Working on Language '${UNIX_LANG}'...";
2422
. ./extract-abbrv-lang.sh;
2523
done
26-
#cp *.ldf ../jabbrv/;
27-

extract-abbrv-lang.sh

+50-38
Original file line numberDiff line numberDiff line change
@@ -3,52 +3,54 @@
33
# NOTE: Expects ISSN_LANG and UNIX_LANG to be defined
44
OUTPUT_FILE="jabbrv-ltwa-${UNIX_LANG}.ldf";
55

6-
# Below is a rather large list of LaTeX replacements for UTF-8 Characters,
7-
# note that only characters found in the LTWA database are included below
6+
# Below is a list of LaTeX replacements for UTF-8 combining diacritical marks and unusual symbols.
7+
# Note that only characters found in the LTWA database are included below
88
# (to save processing time for my poor computer)
99
# \` \' \^ \~ \=
1010
REPLACECHAR=('\(.\)\xCC\x80' '\(.\)\xCC\x81' '\(.\)\xCC\x82' '\(.\)\xCC\x83' '\(.\)\xCC\x84' \
11+
# \.
12+
'\(.\)\xCC\x87' \
1113
# \" \v \J@C \c
1214
'\(.\)\xCC\x88' '\(.\)\xCC\x8C' '\(.\)\xCC\xA1' '\(.\)\xCC\xA7' \
13-
# \=a \"a \^A
14-
'ā' 'ä' 'Â' \
15-
# \'e
16-
'é' \
17-
# \`i \'i
18-
'ì' 'í' \
19-
# \'o \oe \"o
20-
'ó' 'œ' 'ö' \
21-
# \'u \^u \"u
22-
'ú' 'û' 'ü' \
23-
# \v s
24-
'š' \
15+
# \oe
16+
'œ' \
2517
'ʹ');
2618
REPLACEMENT=('\\`\1' "\\\\'\1" '\\^\1' '\\~\1' '\\=\1' \
19+
'\\.\1' \
2720
'\\"\1' '\\v \1' '\\J@C \1' '\\c \1' \
28-
'\\=a' '\\"a' '\\^A' \
29-
"\\\\'e" \
30-
'\\`i' "\\\\'i" \
31-
"\\\\'o" '\\oe ' '\\"o' \
32-
"\\\\'u" '\\^u' '\\"u' \
33-
'\\v s' \
21+
'\\oe ' \
3422
"'");
3523
REPLACE_RULES="";
3624
MAXRULES=$((${#REPLACECHAR[@]}-1));
3725
for J in `seq 0 ${MAXRULES}`; do
3826
REPLACE_RULES="${REPLACE_RULES};s/${REPLACECHAR[$J]}/${REPLACEMENT[$J]}/g";
3927
done
40-
#REPLACE_RULES="${REPLACE_RULES:1}";
41-
# For testing:
42-
#echo "${REPLACE_RULES}";
43-
#TEST="<td>èlement-<td>elem.<td>rus,fre,eng<td>";
44-
#TEST=`echo "${TEST}" | awk 'BEGIN { FS = "<td>" } ; { print $2 }'`;
45-
#TEST="èpidemiolog- ébénisterie";
46-
#TEST=`echo "${TEST:0:1}" | tr a-z A-Z`"${TEST:1}";
47-
#echo "${TEST}" | sed -e "${REPLACE_RULES:1}";
48-
#exit 0;
4928

50-
HEADER="%% Copyright 2010 Erich Hoover
51-
29+
REPLACE_ODD="";
30+
# Almost the entire LTWA uses "combining" diacritical marks, except for limited instances of the following precomposed characters:
31+
REPLACE_ODD="${REPLACE_ODD};s/Â/A\xCC\x82/g"; # capital A with circumflex (Â)
32+
REPLACE_ODD="${REPLACE_ODD};s/ā/a\xCC\x84/g"; # lowercase a with overline (ā)
33+
REPLACE_ODD="${REPLACE_ODD};s/ä/a\xCC\x88/g"; # lowercase a with umlauts (ä)
34+
REPLACE_ODD="${REPLACE_ODD};s/è/e\xCC\x80/g"; # lowercase e with backtick (è)
35+
REPLACE_ODD="${REPLACE_ODD};s/é/e\xCC\x81/g"; # lowercase e with forward tick (é)
36+
REPLACE_ODD="${REPLACE_ODD};s/ì/i\xCC\x80/g"; # lowercase i with backtick (ì)
37+
REPLACE_ODD="${REPLACE_ODD};s/í/i\xCC\x81/g"; # lowercase i with forward tick (í)
38+
REPLACE_ODD="${REPLACE_ODD};s/Ö/O\xCC\x88/g"; # capital O with umlauts (Ö)
39+
REPLACE_ODD="${REPLACE_ODD};s/ó/o\xCC\x81/g"; # lowercase o with forward tick (ó)
40+
REPLACE_ODD="${REPLACE_ODD};s/ö/o\xCC\x88/g"; # lowercase o with umlauts (ö)
41+
REPLACE_ODD="${REPLACE_ODD};s/ú/u\xCC\x81/g"; # lowercase u with forward tick (ú)
42+
REPLACE_ODD="${REPLACE_ODD};s/û/u\xCC\x82/g"; # lowercase u with circumflex (û)
43+
REPLACE_ODD="${REPLACE_ODD};s/ü/u\xCC\x88/g"; # lowercase u with umlauts (ü)
44+
REPLACE_ODD="${REPLACE_ODD};s/š/s\xCC\x8C/g"; # lowercase s with caron (š)
45+
46+
# remove all the "Not Applicable" entries from the list
47+
REPLACE_NA="/.*\tn.a.\t.*/d";
48+
49+
# remove all the entries that start with a dash or a single quote
50+
REPLACE_NONLETTER="/^[-']/d";
51+
52+
HEADER="%% Copyright 2010-2019 Erich E. Hoover
53+
5254
%%
5355
%% =============================================
5456
%% IMPORTANT NOTICE:
@@ -60,23 +62,32 @@ HEADER="%% Copyright 2010 Erich Hoover
6062
%% http://www.latex-project.org/lppl.txt
6163
%% =============================================
6264
%% The List of Title Word Abbreviations below is automatically
63-
%% generatedfrom the ISSN LTWA database, publicly accessible from
65+
%% generated from the ISSN LTWA database, publicly accessible from
6466
%% their website:
6567
%% http://www.issn.org/2-22660-LTWA.php
6668
";
6769
echo "${HEADER}" > ${OUTPUT_FILE};
68-
ENTRIES=`cat lang_data.txt`;
70+
ENTRIES=$(cat lang_data.txt | sed -e "${REPLACE_ODD};${REPLACE_NA};${REPLACE_NONLETTER}");
71+
I=0;
72+
export IFS=$'\r\n'
6973
for ENTRY in ${ENTRIES}; do
74+
I=$((I+1));
75+
if [ "${I}" -eq "1" ]; then continue; fi
7076
# Remove punctuation (periods):
7177
ENTRY=`echo "${ENTRY}" | sed 's/\.//g'`;
72-
# Pull out the applicable languages for testing:
73-
LANGS=`echo "${ENTRY}" | awk 'BEGIN { FS = "<td>" } ; { print $4 }'`;
78+
# Pull out the applicable languages, title, and abbreviation:
79+
OLDIFS=${IFS}
80+
export IFS=$'\t'
81+
while [ 1 ]; do
82+
read TITLE ABBRV LANGS;
83+
break;
84+
done < <(echo "${ENTRY}")
85+
export IFS=${OLDIFS}
7486
# See if one of the languages is the one we're interested in outputting
87+
OLDIFS=${IFS}
88+
export IFS=' '
7589
for ELANG in `echo "${LANGS}" | sed 's/,/\ /g'`; do
7690
if [ "${ELANG}" = "${ISSN_LANG}" ]; then
77-
# Pull out the title and abbreviation:
78-
TITLE=`echo "${ENTRY}" | awk 'BEGIN { FS = "<td>" } ; { print $2 }'`;
79-
ABBRV=`echo "${ENTRY}" | awk 'BEGIN { FS = "<td>" } ; { print $3 }'`;
8091
# Capitalize the first letter of the title and the abbreviation
8192
TITLE=`echo "${TITLE:0:1}" | tr a-z A-Z`"${TITLE:1}";
8293
ABBRV=`echo "${ABBRV:0:1}" | tr a-z A-Z`"${ABBRV:1}";
@@ -96,5 +107,6 @@ for ENTRY in ${ENTRIES}; do
96107
break;
97108
fi
98109
done
110+
export IFS=${OLDIFS}
99111
done
100112

0 commit comments

Comments
 (0)