compholio
diff --git a/‎download-abbrv.sh
+2-6 b/‎download-abbrv.sh
+2-6
diff --git a/‎extract-abbrv-lang.sh
+50-38 b/‎extract-abbrv-lang.sh
+50-38
@@ -4,17 +4,15 @@
 ## Pass "redownload" as the first flag to redownload the LTWA, ie:
 ## 	./download-abbrv.sh redownload
 
+LTWA_URL="https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt";
 ISSN_LANGUAGES=("mul" "eng" "ger" "fre" "spa");
 UNIX_LANGUAGES=("all" "en"  "de"  "fr"  "es" );
 MAXLANG=$((${#ISSN_LANGUAGES[@]}-1));
 
 if [ ! -f lang_data.txt ] || [ "$1" = "redownload" ]; then
 	echo "Downloading LTWA Database...";
 	rm lang_data.txt 2> /dev/null;
-	for LETTER in `echo {a..z}`; do
-		ENTRIES=`wget -O - http://www.issn.org/2-22661-LTWA-online.php?letter=${LETTER} | grep '<tr><td>' | grep -v 'n.a.' | sed 's/\///g' | sed 's/<tr>//g' | sed 's/<td><td>/<td>/g' | sed 's/,\ /,/g' | sed 's/\ /\&nbsp;/g'`;
-		echo "${ENTRIES}" >> lang_data.txt;
-	done
+	wget --no-check-certificate -O - "${LTWA_URL}" | iconv -f UTF-16LE -t UTF-8 > lang_data.txt;
 fi
 
 for I in `seq 0 ${MAXLANG}`; do
@@ -23,5 +21,3 @@ for I in `seq 0 ${MAXLANG}`; do
 	echo "Working on Language '${UNIX_LANG}'...";
 	. ./extract-abbrv-lang.sh;
 done
-#cp *.ldf ../jabbrv/;
-
@@ -3,52 +3,54 @@
 # NOTE: Expects ISSN_LANG and UNIX_LANG to be defined
 OUTPUT_FILE="jabbrv-ltwa-${UNIX_LANG}.ldf";
 
-# Below is a rather large list of LaTeX replacements for UTF-8 Characters,
-# note that only characters found in the LTWA database are included below
+# Below is a list of LaTeX replacements for UTF-8 combining diacritical marks and unusual symbols.
+# Note that only characters found in the LTWA database are included below
 # (to save processing time for my poor computer)
 #            \`              \'              \^              \~              \=
 REPLACECHAR=('\(.\)\xCC\x80' '\(.\)\xCC\x81' '\(.\)\xCC\x82' '\(.\)\xCC\x83' '\(.\)\xCC\x84' \
+#            \.
+             '\(.\)\xCC\x87'                                                                 \
 #            \"              \v              \J@C            \c
              '\(.\)\xCC\x88' '\(.\)\xCC\x8C' '\(.\)\xCC\xA1' '\(.\)\xCC\xA7'                 \
-#            \=a             \"a             \^A
-             'ā'             'ä'             'Â'                                             \
-#            \'e
-             'é'                                                                             \
-#            \`i             \'i
-             'ì'             'í'                                                             \
-#            \'o             \oe             \"o
-             'ó'             'œ'             'ö'                                             \
-#            \'u             \^u             \"u
-             'ú'             'û'             'ü'                                             \
-#            \v s
-             'š'                                                                             \
+#            \oe
+             'œ'                                                                             \
              'ʹ');
 REPLACEMENT=('\\`\1'         "\\\\'\1"       '\\^\1'         '\\~\1'         '\\=\1'         \
+             '\\.\1'                                                                         \
              '\\"\1'         '\\v \1'        '\\J@C \1'      '\\c \1'                        \
-             '\\=a'          '\\"a'          '\\^A'                                          \
-             "\\\\'e"                                                                        \
-             '\\`i'          "\\\\'i"                                                        \
-             "\\\\'o"        '\\oe '         '\\"o'                                          \
-             "\\\\'u"        '\\^u'          '\\"u'                                          \
-             '\\v s'                                                                         \
+             '\\oe '                                                                         \
              "'");
 REPLACE_RULES="";
 MAXRULES=$((${#REPLACECHAR[@]}-1));
 for J in `seq 0 ${MAXRULES}`; do
 	REPLACE_RULES="${REPLACE_RULES};s/${REPLACECHAR[$J]}/${REPLACEMENT[$J]}/g";
 done
-#REPLACE_RULES="${REPLACE_RULES:1}";
-# For testing:
-#echo "${REPLACE_RULES}";
-#TEST="<td>èlement-<td>elem.<td>rus,fre,eng<td>";
-#TEST=`echo "${TEST}" | awk 'BEGIN { FS = "<td>" } ; { print $2 }'`;
-#TEST="èpidemiolog- ébénisterie";
-#TEST=`echo "${TEST:0:1}" | tr a-z A-Z`"${TEST:1}";
-#echo "${TEST}" | sed -e "${REPLACE_RULES:1}";
-#exit 0;
 
-HEADER="%% Copyright 2010 Erich Hoover
-%% E-mail: [email protected]
+REPLACE_ODD="";
+# almost the entire LTWA uses "combining" diacritical marks, except for limited instances of:
+REPLACE_ODD="${REPLACE_ODD};s/Â/A\xCC\x82/g"; # capital A with circumflex (Â)
+REPLACE_ODD="${REPLACE_ODD};s/ā/a\xCC\x84/g"; # lowercase a with overline (ā)
+REPLACE_ODD="${REPLACE_ODD};s/ä/a\xCC\x88/g"; # lowercase a with umlauts (ä)
+REPLACE_ODD="${REPLACE_ODD};s/è/e\xCC\x80/g"; # lowercase e with backtick (è)
+REPLACE_ODD="${REPLACE_ODD};s/é/e\xCC\x81/g"; # lowercase e with forward tick (é)
+REPLACE_ODD="${REPLACE_ODD};s/ì/i\xCC\x80/g"; # lowercase i with backtick (ì)
+REPLACE_ODD="${REPLACE_ODD};s/í/i\xCC\x81/g"; # lowercase i with forward tick (í)
+REPLACE_ODD="${REPLACE_ODD};s/Ö/O\xCC\x88/g"; # capital O with umlauts (Ö)
+REPLACE_ODD="${REPLACE_ODD};s/ó/o\xCC\x81/g"; # lowercase o with forward tick (ó)
+REPLACE_ODD="${REPLACE_ODD};s/ö/o\xCC\x88/g"; # lowercase o with umlauts (ö)
+REPLACE_ODD="${REPLACE_ODD};s/ú/u\xCC\x81/g"; # lowercase u with forward tick (ú)
+REPLACE_ODD="${REPLACE_ODD};s/û/u\xCC\x82/g"; # lowercase u with circumflex (û)
+REPLACE_ODD="${REPLACE_ODD};s/ü/u\xCC\x88/g"; # lowercase u with umlauts (ü)
+REPLACE_ODD="${REPLACE_ODD};s/š/s\xCC\x8C/g"; # lowercase s with caron (š)
+
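+# Remove all the "Not Applicable" entries, i.e. lines with a tab-delimited "n.a." field (NOTE: the dots are unescaped, so each also matches any single character)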
+REPLACE_NA="/.*\tn.a.\t.*/d";
+
+# remove all the entries that start with a dash or a single quote
+REPLACE_NONLETTER="/^[-']/d";
+
+HEADER="%% Copyright 2010-2019 Erich E. Hoover
+%% E-mail: [email protected]
 %% 
 %% =============================================
 %% IMPORTANT NOTICE:
@@ -60,23 +62,32 @@ HEADER="%% Copyright 2010 Erich Hoover
 %%   http://www.latex-project.org/lppl.txt
 %% =============================================
 %% The List of Title Word Abbreviations below is automatically
-%% generatedfrom the ISSN LTWA database, publicly accessible from
+%% generated from the ISSN LTWA database, publicly accessible from
 %% their website:
 %%   http://www.issn.org/2-22660-LTWA.php
 ";
 echo "${HEADER}" > ${OUTPUT_FILE};
-ENTRIES=`cat lang_data.txt`;
+ENTRIES=$(cat lang_data.txt | sed -e "${REPLACE_ODD};${REPLACE_NA};${REPLACE_NONLETTER}");
+I=0;
+export IFS=$'\r\n'
 for ENTRY in ${ENTRIES}; do
+    I=$((I+1));
+    if [ "${I}" -eq "1" ]; then continue; fi
 	# Remove punctuation:
 	ENTRY=`echo "${ENTRY}" | sed 's/\.//g'`;
-	# Pull out the applicable languages for testing:
-	LANGS=`echo "${ENTRY}" | awk 'BEGIN { FS = "<td>" } ; { print $4 }'`;
+	# Pull out the applicable languages, title, and abbreviation:
+    OLDIFS=${IFS}
+    export IFS=$'\t'
+    while [ 1 ]; do
+        read TITLE ABBRV LANGS;
+        break;
+    done < <(echo "${ENTRY}")
+    export IFS=${OLDIFS}
 	# See if one of the languages is the one we're interested in outputting
+    OLDIFS=${IFS}
+    export IFS=' '
 	for ELANG in `echo "${LANGS}" | sed 's/,/\ /g'`; do
 		if [ "${ELANG}" = "${ISSN_LANG}" ]; then
-			# Pull out the title and abbreviation:
-			TITLE=`echo "${ENTRY}" | awk 'BEGIN { FS = "<td>" } ; { print $2 }'`;
-			ABBRV=`echo "${ENTRY}" | awk 'BEGIN { FS = "<td>" } ; { print $3 }'`;
 			# Capitalize the first letter of the title and the abbreviation"
 			TITLE=`echo "${TITLE:0:1}" | tr a-z A-Z`"${TITLE:1}";
 			ABBRV=`echo "${ABBRV:0:1}" | tr a-z A-Z`"${ABBRV:1}";
@@ -96,5 +107,6 @@ for ENTRY in ${ENTRIES}; do
 			break;
 		fi
 	done
+    export IFS=${OLDIFS}
 done