diff --git a/CIDTables/IDENTITY-H-ABCDEE.cid b/CIDTables/IDENTITY-H-ABCDEE.cid index be975d1..1bfb9d0 100755 --- a/CIDTables/IDENTITY-H-ABCDEE.cid +++ b/CIDTables/IDENTITY-H-ABCDEE.cid @@ -1,99 +1,99 @@ - array - ( - 3 => ' ', - 19 => '0', - 20 => '1', - 21 => '2', - 22 => '3', - 23 => '4', - 24 => '5', - 25 => '6', - 26 => '7', - 27 => '8', - 28 => '9', - 0x0004 => 'A', - 0x000E => 'Ą', - 0x0011 => 'B', - 0x0012 => 'C', - 0x0018 => 'D', - 0x001C => 'E', - 0x0026 => 'F', - 0x0027 => 'G', - 0x002C => 'H', - 0x002F => 'I', - 0x003A => 'J', - 0x003C => 'K', - 0x003E => 'L', - 0x0042 => 'Ł', - 0x0044 => 'M', - 0x0045 => 'N', - 0x0046 => 'Ń', - 0x004B => 'O', - 0x004D => 'Ó', - 0x0057 => 'P', - 0x005A => 'R', - 0x005E => 'S', - 0x005F => 'Ś', - 0x0064 => 'T', - 0x0068 => 'U', - 0x0073 => 'V', - 0x0074 => 'W', - 0x007A => 'Y', - 0x007F => 'Z', - 0x0082 => 'Ż', - 0x0102 => 'a', - 0x010C => 'ą', - 0x010F => 'b', - 0x0110 => 'c', - 0x0111 => 'ć', - 0x011A => 'd', - 0x011E => 'e', - 0x0127 => 'ę', - 0x0128 => 'f', - 0x0150 => 'g', - 0x015a => 'h', - 0x015d => 'i', - 0x0169 => 'j', - 0x016C => 'k', - 0x016F => 'l', - 0x0173 => 'ł', - 0x0175 => 'm', - 0x0176 => 'n', - 0x0177 => 'ń', - 0x017d => 'o', - 0x017F => 'ó', - 0x0189 => 'p', - 0x018C => 'r', - 0x0190 => 's', - 0x0191 => 'ś', - 0x019A => 't', - 0x01B5 => 'u', - 0x01C1 => 'w', - 0x01C7 => 'y', - 0x01CC => 'z', - 0x01CD => 'ź', - 0x01CF => 'ż', - 0x0355 => ',', - 0x0356 => ';', - 0x0358 => '.', - 0x035F => '"', - 0x0363 => '"', - 0x0372 => '-', - 0x0374 => '-', - 0x037E => '(', - 0x037F => ')', - 0x03EC => '0', - 0x03ED => '1', - 0x03EE => '2', - 0x03EF => '3', - 0x03F0 => '4', - 0x03F1 => '5', - 0x03F2 => '6', - 0x03F3 => '7', - 0x03F4 => '8', - 0x03F5 => '9' - ) - ) ; + array + ( + 3 => ' ', + 19 => '0', + 20 => '1', + 21 => '2', + 22 => '3', + 23 => '4', + 24 => '5', + 25 => '6', + 26 => '7', + 27 => '8', + 28 => '9', + 0x0004 => 'A', + 0x000E => 'Ą', + 0x0011 => 'B', + 0x0012 => 'C', + 0x0018 => 'D', + 0x001C => 'E', 
+ 0x0026 => 'F', + 0x0027 => 'G', + 0x002C => 'H', + 0x002F => 'I', + 0x003A => 'J', + 0x003C => 'K', + 0x003E => 'L', + 0x0042 => 'Ł', + 0x0044 => 'M', + 0x0045 => 'N', + 0x0046 => 'Ń', + 0x004B => 'O', + 0x004D => 'Ó', + 0x0057 => 'P', + 0x005A => 'R', + 0x005E => 'S', + 0x005F => 'Ś', + 0x0064 => 'T', + 0x0068 => 'U', + 0x0073 => 'V', + 0x0074 => 'W', + 0x007A => 'Y', + 0x007F => 'Z', + 0x0082 => 'Ż', + 0x0102 => 'a', + 0x010C => 'ą', + 0x010F => 'b', + 0x0110 => 'c', + 0x0111 => 'ć', + 0x011A => 'd', + 0x011E => 'e', + 0x0127 => 'ę', + 0x0128 => 'f', + 0x0150 => 'g', + 0x015a => 'h', + 0x015d => 'i', + 0x0169 => 'j', + 0x016C => 'k', + 0x016F => 'l', + 0x0173 => 'ł', + 0x0175 => 'm', + 0x0176 => 'n', + 0x0177 => 'ń', + 0x017d => 'o', + 0x017F => 'ó', + 0x0189 => 'p', + 0x018C => 'r', + 0x0190 => 's', + 0x0191 => 'ś', + 0x019A => 't', + 0x01B5 => 'u', + 0x01C1 => 'w', + 0x01C7 => 'y', + 0x01CC => 'z', + 0x01CD => 'ź', + 0x01CF => 'ż', + 0x0355 => ',', + 0x0356 => ';', + 0x0358 => '.', + 0x035F => '"', + 0x0363 => '"', + 0x0372 => '-', + 0x0374 => '-', + 0x037E => '(', + 0x037F => ')', + 0x03EC => '0', + 0x03ED => '1', + 0x03EE => '2', + 0x03EF => '3', + 0x03F0 => '4', + 0x03F1 => '5', + 0x03F2 => '6', + 0x03F3 => '7', + 0x03F4 => '8', + 0x03F5 => '9' + ) + ) ; diff --git a/CIDTables/IDENTITY-H-GQJGLM.cid b/CIDTables/IDENTITY-H-GQJGLM.cid index a2b1e86..9d6c04f 100755 --- a/CIDTables/IDENTITY-H-GQJGLM.cid +++ b/CIDTables/IDENTITY-H-GQJGLM.cid @@ -1,263 +1,263 @@ - array - ( - 0x0000 => self::UNKNOWN_CID, - 0x0001 => self::UNKNOWN_CID, - 0x0002 => self::UNKNOWN_CID, - 0x0003 => self::UNKNOWN_CID, - 0x0004 => ' ', - 0x0005 => self::UNKNOWN_CID, - 0x0006 => '"', - 0x0007 => self::UNKNOWN_CID, - 0x0008 => self::UNKNOWN_CID, - 0x0009 => '%', - 0x000A => self::UNKNOWN_CID, - 0x000B => self::UNKNOWN_CID, - 0x000C => '(', - 0x000D => ')', - 0x000E => '*', - 0x000F => '+', - 0x0010 => ',', - 0x0011 => '-', - 0x0012 => '.', - 0x0013 => '/', - 0x0014 => '0', - 0x0015 => 
'1', - 0x0016 => '2', - 0x0017 => '3', - 0x0018 => '4', - 0x0019 => '5', - 0x001A => '6', - 0x001B => '7', - 0x001C => '8', - 0x001D => '9', - 0x001E => ':', - 0x001F => self::UNKNOWN_CID, - 0x0020 => self::UNKNOWN_CID, - 0x0021 => self::UNKNOWN_CID, - 0x0022 => self::UNKNOWN_CID, - 0x0023 => '?', - 0x0024 => '@', - 0x0025 => 'A', - 0x0026 => 'B', - 0x0027 => 'C', - 0x0028 => 'D', - 0x0029 => 'E', - 0x002A => 'F', - 0x002B => 'G', - 0x002C => 'H', - 0x002D => 'I', - 0x002E => 'J', - 0x002F => 'K', - 0x0030 => 'L', - 0x0031 => 'M', - 0x0032 => 'N', - 0x0033 => 'O', - 0x0034 => 'P', - 0x0035 => 'Q', - 0x0036 => 'R', - 0x0037 => 'S', - 0x0038 => 'T', - 0x0039 => 'U', - 0x003A => 'V', - 0x003B => 'W', - 0x003C => 'X', - 0x003D => 'Y', - 0x003E => 'Z', - 0x003F => self::UNKNOWN_CID, - 0x0040 => self::UNKNOWN_CID, - 0x0041 => self::UNKNOWN_CID, - 0x0042 => self::UNKNOWN_CID, - 0x0043 => self::UNKNOWN_CID, - 0x0044 => self::UNKNOWN_CID, - 0x0045 => 'a', - 0x0046 => 'b', - 0x0047 => 'c', - 0x0048 => 'd', - 0x0049 => 'e', - 0x004A => 'f', - 0x004B => 'g', - 0x004C => 'h', - 0x004D => 'i', - 0x004E => 'j', - 0x004F => 'k', - 0x0050 => 'l', - 0x0051 => 'm', - 0x0052 => 'n', - 0x0053 => 'o', - 0x0054 => 'p', - 0x0055 => 'q', - 0x0056 => 'r', - 0x0057 => 's', - 0x0058 => 't', - 0x0059 => 'u', - 0x005A => 'v', - 0x005B => 'w', - 0x005C => 'x', - 0x005D => 'y', - 0x005E => 'z', - 0x005F => self::UNKNOWN_CID, - 0x0060 => self::UNKNOWN_CID, - 0x0061 => self::UNKNOWN_CID, - 0x0062 => self::UNKNOWN_CID, - 0x0063 => ' ', - 0x0064 => self::UNKNOWN_CID, - 0x0065 => self::UNKNOWN_CID, - 0x0066 => self::UNKNOWN_CID, - 0x0067 => self::UNKNOWN_CID, - 0x0068 => self::UNKNOWN_CID, - 0x0069 => self::UNKNOWN_CID, - 0x006A => self::UNKNOWN_CID, - 0x006B => self::UNKNOWN_CID, - 0x006C => self::UNKNOWN_CID, - 0x006D => self::UNKNOWN_CID, - 0x006E => self::UNKNOWN_CID, - 0x006F => self::UNKNOWN_CID, - 0x0070 => self::UNKNOWN_CID, - 0x0071 => self::UNKNOWN_CID, - 0x0072 => self::UNKNOWN_CID, - 
0x0073 => self::UNKNOWN_CID, - 0x0074 => self::UNKNOWN_CID, - 0x0075 => self::UNKNOWN_CID, - 0x0076 => self::UNKNOWN_CID, - 0x0077 => self::UNKNOWN_CID, - 0x0078 => self::UNKNOWN_CID, - 0x0079 => self::UNKNOWN_CID, - 0x007A => self::UNKNOWN_CID, - 0x007B => self::UNKNOWN_CID, - 0x007C => self::UNKNOWN_CID, - 0x007D => "\xC2\xB0", - 0x007E => self::UNKNOWN_CID, - 0x007F => self::UNKNOWN_CID, - 0x0080 => self::UNKNOWN_CID, - 0x0081 => self::UNKNOWN_CID, - 0x0082 => self::UNKNOWN_CID, - 0x0083 => self::UNKNOWN_CID, - 0x0084 => self::UNKNOWN_CID, - 0x0085 => self::UNKNOWN_CID, - 0x0086 => self::UNKNOWN_CID, - 0x0087 => self::UNKNOWN_CID, - 0x0088 => self::UNKNOWN_CID, - 0x0089 => self::UNKNOWN_CID, - 0x008A => self::UNKNOWN_CID, - 0x008B => self::UNKNOWN_CID, - 0x008C => "\xC3\x89", - 0x008D => self::UNKNOWN_CID, - 0x008E => self::UNKNOWN_CID, - 0x008F => self::UNKNOWN_CID, - 0x0090 => self::UNKNOWN_CID, - 0x0091 => self::UNKNOWN_CID, - 0x0092 => self::UNKNOWN_CID, - 0x0093 => self::UNKNOWN_CID, - 0x0094 => self::UNKNOWN_CID, - 0x0095 => self::UNKNOWN_CID, - 0x0096 => "\xC3\x93", - 0x0097 => self::UNKNOWN_CID, - 0x0098 => self::UNKNOWN_CID, - 0x0099 => self::UNKNOWN_CID, - 0x009A => self::UNKNOWN_CID, - 0x009B => self::UNKNOWN_CID, - 0x009C => self::UNKNOWN_CID, - 0x009D => self::UNKNOWN_CID, - 0x009E => self::UNKNOWN_CID, - 0x009F => self::UNKNOWN_CID, - 0x00A0 => self::UNKNOWN_CID, - 0x00A1 => self::UNKNOWN_CID, - 0x00A2 => self::UNKNOWN_CID, - 0x00A3 => self::UNKNOWN_CID, - 0x00A4 => "\xC3\xA1", - 0x00A5 => self::UNKNOWN_CID, - 0x00A6 => self::UNKNOWN_CID, - 0x00A7 => self::UNKNOWN_CID, - 0x00A8 => self::UNKNOWN_CID, - 0x00A9 => self::UNKNOWN_CID, - 0x00AA => self::UNKNOWN_CID, - 0x00AB => self::UNKNOWN_CID, - 0x00AC => "\xC3\xA9", - 0x00AD => self::UNKNOWN_CID, - 0x00AE => self::UNKNOWN_CID, - 0x00AF => self::UNKNOWN_CID, - 0x00B0 => "\xC3\xAD", - 0x00B1 => self::UNKNOWN_CID, - 0x00B2 => self::UNKNOWN_CID, - 0x00B3 => self::UNKNOWN_CID, - 0x00B4 => "\xC3\xB1", - 
0x00B5 => self::UNKNOWN_CID, - 0x00B6 => "\xC3\xB3", - 0x00B7 => self::UNKNOWN_CID, - 0x00B8 => self::UNKNOWN_CID, - 0x00B9 => self::UNKNOWN_CID, - 0x00BA => self::UNKNOWN_CID, - 0x00BB => self::UNKNOWN_CID, - 0x00BC => self::UNKNOWN_CID, - 0x00BD => "\xC3\xB4", - 0x00BE => self::UNKNOWN_CID, - 0x00BF => self::UNKNOWN_CID, - 0x00C0 => self::UNKNOWN_CID, - 0x00C1 => self::UNKNOWN_CID, - 0x00C2 => self::UNKNOWN_CID, - 0x00C3 => self::UNKNOWN_CID, - 0x00C4 => self::UNKNOWN_CID, - 0x00C5 => self::UNKNOWN_CID, - 0x00C6 => self::UNKNOWN_CID, - 0x00C7 => self::UNKNOWN_CID, - 0x00C8 => self::UNKNOWN_CID, - 0x00C9 => self::UNKNOWN_CID, - 0x00CA => self::UNKNOWN_CID, - 0x00CB => self::UNKNOWN_CID, - 0x00CC => self::UNKNOWN_CID, - 0x00CD => self::UNKNOWN_CID, - 0x00CE => self::UNKNOWN_CID, - 0x00CF => self::UNKNOWN_CID, - 0x00D0 => self::UNKNOWN_CID, - 0x00D1 => self::UNKNOWN_CID, - 0x00D2 => self::UNKNOWN_CID, - 0x00D3 => self::UNKNOWN_CID, - 0x00D4 => self::UNKNOWN_CID, - 0x00D5 => self::UNKNOWN_CID, - 0x00D6 => self::UNKNOWN_CID, - 0x00D7 => self::UNKNOWN_CID, - 0x00D8 => self::UNKNOWN_CID, - 0x00D9 => self::UNKNOWN_CID, - 0x00DA => self::UNKNOWN_CID, - 0x00DB => self::UNKNOWN_CID, - 0x00DC => self::UNKNOWN_CID, - 0x00DD => self::UNKNOWN_CID, - 0x00DE => self::UNKNOWN_CID, - 0x00DF => self::UNKNOWN_CID, - 0x00E0 => self::UNKNOWN_CID, - 0x00E1 => self::UNKNOWN_CID, - 0x00E2 => self::UNKNOWN_CID, - 0x00E3 => self::UNKNOWN_CID, - 0x00E4 => self::UNKNOWN_CID, - 0x00E5 => self::UNKNOWN_CID, - 0x00E6 => self::UNKNOWN_CID, - 0x00E7 => self::UNKNOWN_CID, - 0x00E8 => self::UNKNOWN_CID, - 0x00E9 => self::UNKNOWN_CID, - 0x00EA => self::UNKNOWN_CID, - 0x00EB => self::UNKNOWN_CID, - 0x00EC => self::UNKNOWN_CID, - 0x00ED => self::UNKNOWN_CID, - 0x00EE => self::UNKNOWN_CID, - 0x00EF => self::UNKNOWN_CID, - 0x00F0 => self::UNKNOWN_CID, - 0x00F1 => self::UNKNOWN_CID, - 0x00F2 => self::UNKNOWN_CID, - 0x00F3 => self::UNKNOWN_CID, - 0x00F4 => self::UNKNOWN_CID, - 0x00F5 => self::UNKNOWN_CID, 
- 0x00F6 => self::UNKNOWN_CID, - 0x00F7 => self::UNKNOWN_CID, - 0x00F8 => self::UNKNOWN_CID, - 0x00F9 => self::UNKNOWN_CID, - 0x00FA => self::UNKNOWN_CID, - 0x00FB => self::UNKNOWN_CID, - 0x00FC => self::UNKNOWN_CID, - 0x00FD => self::UNKNOWN_CID, - 0x00FE => self::UNKNOWN_CID, - 0x0192 => "\xE2\x82\xAC", - ) - ) ; + array + ( + 0x0000 => self::UNKNOWN_CID, + 0x0001 => self::UNKNOWN_CID, + 0x0002 => self::UNKNOWN_CID, + 0x0003 => self::UNKNOWN_CID, + 0x0004 => ' ', + 0x0005 => self::UNKNOWN_CID, + 0x0006 => '"', + 0x0007 => self::UNKNOWN_CID, + 0x0008 => self::UNKNOWN_CID, + 0x0009 => '%', + 0x000A => self::UNKNOWN_CID, + 0x000B => self::UNKNOWN_CID, + 0x000C => '(', + 0x000D => ')', + 0x000E => '*', + 0x000F => '+', + 0x0010 => ',', + 0x0011 => '-', + 0x0012 => '.', + 0x0013 => '/', + 0x0014 => '0', + 0x0015 => '1', + 0x0016 => '2', + 0x0017 => '3', + 0x0018 => '4', + 0x0019 => '5', + 0x001A => '6', + 0x001B => '7', + 0x001C => '8', + 0x001D => '9', + 0x001E => ':', + 0x001F => self::UNKNOWN_CID, + 0x0020 => self::UNKNOWN_CID, + 0x0021 => self::UNKNOWN_CID, + 0x0022 => self::UNKNOWN_CID, + 0x0023 => '?', + 0x0024 => '@', + 0x0025 => 'A', + 0x0026 => 'B', + 0x0027 => 'C', + 0x0028 => 'D', + 0x0029 => 'E', + 0x002A => 'F', + 0x002B => 'G', + 0x002C => 'H', + 0x002D => 'I', + 0x002E => 'J', + 0x002F => 'K', + 0x0030 => 'L', + 0x0031 => 'M', + 0x0032 => 'N', + 0x0033 => 'O', + 0x0034 => 'P', + 0x0035 => 'Q', + 0x0036 => 'R', + 0x0037 => 'S', + 0x0038 => 'T', + 0x0039 => 'U', + 0x003A => 'V', + 0x003B => 'W', + 0x003C => 'X', + 0x003D => 'Y', + 0x003E => 'Z', + 0x003F => self::UNKNOWN_CID, + 0x0040 => self::UNKNOWN_CID, + 0x0041 => self::UNKNOWN_CID, + 0x0042 => self::UNKNOWN_CID, + 0x0043 => self::UNKNOWN_CID, + 0x0044 => self::UNKNOWN_CID, + 0x0045 => 'a', + 0x0046 => 'b', + 0x0047 => 'c', + 0x0048 => 'd', + 0x0049 => 'e', + 0x004A => 'f', + 0x004B => 'g', + 0x004C => 'h', + 0x004D => 'i', + 0x004E => 'j', + 0x004F => 'k', + 0x0050 => 'l', + 0x0051 => 'm', + 0x0052 
=> 'n', + 0x0053 => 'o', + 0x0054 => 'p', + 0x0055 => 'q', + 0x0056 => 'r', + 0x0057 => 's', + 0x0058 => 't', + 0x0059 => 'u', + 0x005A => 'v', + 0x005B => 'w', + 0x005C => 'x', + 0x005D => 'y', + 0x005E => 'z', + 0x005F => self::UNKNOWN_CID, + 0x0060 => self::UNKNOWN_CID, + 0x0061 => self::UNKNOWN_CID, + 0x0062 => self::UNKNOWN_CID, + 0x0063 => ' ', + 0x0064 => self::UNKNOWN_CID, + 0x0065 => self::UNKNOWN_CID, + 0x0066 => self::UNKNOWN_CID, + 0x0067 => self::UNKNOWN_CID, + 0x0068 => self::UNKNOWN_CID, + 0x0069 => self::UNKNOWN_CID, + 0x006A => self::UNKNOWN_CID, + 0x006B => self::UNKNOWN_CID, + 0x006C => self::UNKNOWN_CID, + 0x006D => self::UNKNOWN_CID, + 0x006E => self::UNKNOWN_CID, + 0x006F => self::UNKNOWN_CID, + 0x0070 => self::UNKNOWN_CID, + 0x0071 => self::UNKNOWN_CID, + 0x0072 => self::UNKNOWN_CID, + 0x0073 => self::UNKNOWN_CID, + 0x0074 => self::UNKNOWN_CID, + 0x0075 => self::UNKNOWN_CID, + 0x0076 => self::UNKNOWN_CID, + 0x0077 => self::UNKNOWN_CID, + 0x0078 => self::UNKNOWN_CID, + 0x0079 => self::UNKNOWN_CID, + 0x007A => self::UNKNOWN_CID, + 0x007B => self::UNKNOWN_CID, + 0x007C => self::UNKNOWN_CID, + 0x007D => "\xC2\xB0", + 0x007E => self::UNKNOWN_CID, + 0x007F => self::UNKNOWN_CID, + 0x0080 => self::UNKNOWN_CID, + 0x0081 => self::UNKNOWN_CID, + 0x0082 => self::UNKNOWN_CID, + 0x0083 => self::UNKNOWN_CID, + 0x0084 => self::UNKNOWN_CID, + 0x0085 => self::UNKNOWN_CID, + 0x0086 => self::UNKNOWN_CID, + 0x0087 => self::UNKNOWN_CID, + 0x0088 => self::UNKNOWN_CID, + 0x0089 => self::UNKNOWN_CID, + 0x008A => self::UNKNOWN_CID, + 0x008B => self::UNKNOWN_CID, + 0x008C => "\xC3\x89", + 0x008D => self::UNKNOWN_CID, + 0x008E => self::UNKNOWN_CID, + 0x008F => self::UNKNOWN_CID, + 0x0090 => self::UNKNOWN_CID, + 0x0091 => self::UNKNOWN_CID, + 0x0092 => self::UNKNOWN_CID, + 0x0093 => self::UNKNOWN_CID, + 0x0094 => self::UNKNOWN_CID, + 0x0095 => self::UNKNOWN_CID, + 0x0096 => "\xC3\x93", + 0x0097 => self::UNKNOWN_CID, + 0x0098 => self::UNKNOWN_CID, + 0x0099 => 
self::UNKNOWN_CID, + 0x009A => self::UNKNOWN_CID, + 0x009B => self::UNKNOWN_CID, + 0x009C => self::UNKNOWN_CID, + 0x009D => self::UNKNOWN_CID, + 0x009E => self::UNKNOWN_CID, + 0x009F => self::UNKNOWN_CID, + 0x00A0 => self::UNKNOWN_CID, + 0x00A1 => self::UNKNOWN_CID, + 0x00A2 => self::UNKNOWN_CID, + 0x00A3 => self::UNKNOWN_CID, + 0x00A4 => "\xC3\xA1", + 0x00A5 => self::UNKNOWN_CID, + 0x00A6 => self::UNKNOWN_CID, + 0x00A7 => self::UNKNOWN_CID, + 0x00A8 => self::UNKNOWN_CID, + 0x00A9 => self::UNKNOWN_CID, + 0x00AA => self::UNKNOWN_CID, + 0x00AB => self::UNKNOWN_CID, + 0x00AC => "\xC3\xA9", + 0x00AD => self::UNKNOWN_CID, + 0x00AE => self::UNKNOWN_CID, + 0x00AF => self::UNKNOWN_CID, + 0x00B0 => "\xC3\xAD", + 0x00B1 => self::UNKNOWN_CID, + 0x00B2 => self::UNKNOWN_CID, + 0x00B3 => self::UNKNOWN_CID, + 0x00B4 => "\xC3\xB1", + 0x00B5 => self::UNKNOWN_CID, + 0x00B6 => "\xC3\xB3", + 0x00B7 => self::UNKNOWN_CID, + 0x00B8 => self::UNKNOWN_CID, + 0x00B9 => self::UNKNOWN_CID, + 0x00BA => self::UNKNOWN_CID, + 0x00BB => self::UNKNOWN_CID, + 0x00BC => self::UNKNOWN_CID, + 0x00BD => "\xC3\xB4", + 0x00BE => self::UNKNOWN_CID, + 0x00BF => self::UNKNOWN_CID, + 0x00C0 => self::UNKNOWN_CID, + 0x00C1 => self::UNKNOWN_CID, + 0x00C2 => self::UNKNOWN_CID, + 0x00C3 => self::UNKNOWN_CID, + 0x00C4 => self::UNKNOWN_CID, + 0x00C5 => self::UNKNOWN_CID, + 0x00C6 => self::UNKNOWN_CID, + 0x00C7 => self::UNKNOWN_CID, + 0x00C8 => self::UNKNOWN_CID, + 0x00C9 => self::UNKNOWN_CID, + 0x00CA => self::UNKNOWN_CID, + 0x00CB => self::UNKNOWN_CID, + 0x00CC => self::UNKNOWN_CID, + 0x00CD => self::UNKNOWN_CID, + 0x00CE => self::UNKNOWN_CID, + 0x00CF => self::UNKNOWN_CID, + 0x00D0 => self::UNKNOWN_CID, + 0x00D1 => self::UNKNOWN_CID, + 0x00D2 => self::UNKNOWN_CID, + 0x00D3 => self::UNKNOWN_CID, + 0x00D4 => self::UNKNOWN_CID, + 0x00D5 => self::UNKNOWN_CID, + 0x00D6 => self::UNKNOWN_CID, + 0x00D7 => self::UNKNOWN_CID, + 0x00D8 => self::UNKNOWN_CID, + 0x00D9 => self::UNKNOWN_CID, + 0x00DA => self::UNKNOWN_CID, + 
0x00DB => self::UNKNOWN_CID, + 0x00DC => self::UNKNOWN_CID, + 0x00DD => self::UNKNOWN_CID, + 0x00DE => self::UNKNOWN_CID, + 0x00DF => self::UNKNOWN_CID, + 0x00E0 => self::UNKNOWN_CID, + 0x00E1 => self::UNKNOWN_CID, + 0x00E2 => self::UNKNOWN_CID, + 0x00E3 => self::UNKNOWN_CID, + 0x00E4 => self::UNKNOWN_CID, + 0x00E5 => self::UNKNOWN_CID, + 0x00E6 => self::UNKNOWN_CID, + 0x00E7 => self::UNKNOWN_CID, + 0x00E8 => self::UNKNOWN_CID, + 0x00E9 => self::UNKNOWN_CID, + 0x00EA => self::UNKNOWN_CID, + 0x00EB => self::UNKNOWN_CID, + 0x00EC => self::UNKNOWN_CID, + 0x00ED => self::UNKNOWN_CID, + 0x00EE => self::UNKNOWN_CID, + 0x00EF => self::UNKNOWN_CID, + 0x00F0 => self::UNKNOWN_CID, + 0x00F1 => self::UNKNOWN_CID, + 0x00F2 => self::UNKNOWN_CID, + 0x00F3 => self::UNKNOWN_CID, + 0x00F4 => self::UNKNOWN_CID, + 0x00F5 => self::UNKNOWN_CID, + 0x00F6 => self::UNKNOWN_CID, + 0x00F7 => self::UNKNOWN_CID, + 0x00F8 => self::UNKNOWN_CID, + 0x00F9 => self::UNKNOWN_CID, + 0x00FA => self::UNKNOWN_CID, + 0x00FB => self::UNKNOWN_CID, + 0x00FC => self::UNKNOWN_CID, + 0x00FD => self::UNKNOWN_CID, + 0x00FE => self::UNKNOWN_CID, + 0x0192 => "\xE2\x82\xAC", + ) + ) ; diff --git a/CIDTables/IDENTITY-H-PISJAS.cid b/CIDTables/IDENTITY-H-PISJAS.cid index 269098b..c820437 100755 --- a/CIDTables/IDENTITY-H-PISJAS.cid +++ b/CIDTables/IDENTITY-H-PISJAS.cid @@ -1,2 +1,2 @@ - array - ( - 0x0000 => self::UNKNOWN_CID, - 0x0001 => self::UNKNOWN_CID, - 0x0002 => self::UNKNOWN_CID, - 0x0003 => self::UNKNOWN_CID, - 0x0004 => self::UNKNOWN_CID, - 0x0005 => self::UNKNOWN_CID, - 0x0006 => self::UNKNOWN_CID, - 0x0007 => self::UNKNOWN_CID, - 0x0008 => self::UNKNOWN_CID, - 0x0009 => self::UNKNOWN_CID, - 0x000A => self::UNKNOWN_CID, - 0x000B => self::UNKNOWN_CID, - 0x000C => self::UNKNOWN_CID, - 0x000D => self::UNKNOWN_CID, - 0x000E => self::UNKNOWN_CID, - 0x000F => self::UNKNOWN_CID, - 0x0010 => self::UNKNOWN_CID, - 0x0011 => self::UNKNOWN_CID, - 0x0012 => self::UNKNOWN_CID, - 0x0013 => self::UNKNOWN_CID, - 0x0014 => 
self::UNKNOWN_CID, - 0x0015 => self::UNKNOWN_CID, - 0x0016 => self::UNKNOWN_CID, - 0x0017 => self::UNKNOWN_CID, - 0x0018 => self::UNKNOWN_CID, - 0x0019 => self::UNKNOWN_CID, - 0x001A => self::UNKNOWN_CID, - 0x001B => self::UNKNOWN_CID, - 0x001C => self::UNKNOWN_CID, - 0x001D => self::UNKNOWN_CID, - 0x001E => self::UNKNOWN_CID, - 0x001F => self::UNKNOWN_CID, - 0x0020 => self::UNKNOWN_CID, - 0x0021 => self::UNKNOWN_CID, - 0x0022 => self::UNKNOWN_CID, - 0x0023 => self::UNKNOWN_CID, - 0x0024 => self::UNKNOWN_CID, - 0x0025 => self::UNKNOWN_CID, - 0x0026 => self::UNKNOWN_CID, - 0x0027 => self::UNKNOWN_CID, - 0x0028 => self::UNKNOWN_CID, - 0x0029 => self::UNKNOWN_CID, - 0x002A => self::UNKNOWN_CID, - 0x002B => self::UNKNOWN_CID, - 0x002C => self::UNKNOWN_CID, - 0x002D => self::UNKNOWN_CID, - 0x002E => self::UNKNOWN_CID, - 0x002F => self::UNKNOWN_CID, - 0x0030 => self::UNKNOWN_CID, - 0x0031 => self::UNKNOWN_CID, - 0x0032 => self::UNKNOWN_CID, - 0x0033 => self::UNKNOWN_CID, - 0x0034 => self::UNKNOWN_CID, - 0x0035 => self::UNKNOWN_CID, - 0x0036 => self::UNKNOWN_CID, - 0x0037 => self::UNKNOWN_CID, - 0x0038 => self::UNKNOWN_CID, - 0x0039 => self::UNKNOWN_CID, - 0x003A => self::UNKNOWN_CID, - 0x003B => self::UNKNOWN_CID, - 0x003C => self::UNKNOWN_CID, - 0x003D => self::UNKNOWN_CID, - 0x003E => self::UNKNOWN_CID, - 0x003F => self::UNKNOWN_CID, - 0x0040 => self::UNKNOWN_CID, - 0x0041 => self::UNKNOWN_CID, - 0x0042 => self::UNKNOWN_CID, - 0x0043 => self::UNKNOWN_CID, - 0x0044 => self::UNKNOWN_CID, - 0x0045 => self::UNKNOWN_CID, - 0x0046 => self::UNKNOWN_CID, - 0x0047 => self::UNKNOWN_CID, - 0x0048 => self::UNKNOWN_CID, - 0x0049 => self::UNKNOWN_CID, - 0x004A => self::UNKNOWN_CID, - 0x004B => self::UNKNOWN_CID, - 0x004C => self::UNKNOWN_CID, - 0x004D => self::UNKNOWN_CID, - 0x004E => self::UNKNOWN_CID, - 0x004F => self::UNKNOWN_CID, - 0x0050 => self::UNKNOWN_CID, - 0x0051 => self::UNKNOWN_CID, - 0x0052 => self::UNKNOWN_CID, - 0x0053 => self::UNKNOWN_CID, - 0x0054 => 
self::UNKNOWN_CID, - 0x0055 => self::UNKNOWN_CID, - 0x0056 => self::UNKNOWN_CID, - 0x0057 => self::UNKNOWN_CID, - 0x0058 => self::UNKNOWN_CID, - 0x0059 => self::UNKNOWN_CID, - 0x005A => self::UNKNOWN_CID, - 0x005B => self::UNKNOWN_CID, - 0x005C => self::UNKNOWN_CID, - 0x005D => self::UNKNOWN_CID, - 0x005E => self::UNKNOWN_CID, - 0x005F => self::UNKNOWN_CID, - 0x0060 => self::UNKNOWN_CID, - 0x0061 => self::UNKNOWN_CID, - 0x0062 => self::UNKNOWN_CID, - 0x0063 => self::UNKNOWN_CID, - 0x0064 => self::UNKNOWN_CID, - 0x0065 => self::UNKNOWN_CID, - 0x0066 => self::UNKNOWN_CID, - 0x0067 => self::UNKNOWN_CID, - 0x0068 => self::UNKNOWN_CID, - 0x0069 => self::UNKNOWN_CID, - 0x006A => self::UNKNOWN_CID, - 0x006B => self::UNKNOWN_CID, - 0x006C => self::UNKNOWN_CID, - 0x006D => self::UNKNOWN_CID, - 0x006E => self::UNKNOWN_CID, - 0x006F => self::UNKNOWN_CID, - 0x0070 => self::UNKNOWN_CID, - 0x0071 => self::UNKNOWN_CID, - 0x0072 => self::UNKNOWN_CID, - 0x0073 => self::UNKNOWN_CID, - 0x0074 => self::UNKNOWN_CID, - 0x0075 => self::UNKNOWN_CID, - 0x0076 => self::UNKNOWN_CID, - 0x0077 => self::UNKNOWN_CID, - 0x0078 => self::UNKNOWN_CID, - 0x0079 => self::UNKNOWN_CID, - 0x007A => self::UNKNOWN_CID, - 0x007B => self::UNKNOWN_CID, - 0x007C => self::UNKNOWN_CID, - 0x007D => self::UNKNOWN_CID, - 0x007E => self::UNKNOWN_CID, - 0x007F => self::UNKNOWN_CID, - 0x0080 => self::UNKNOWN_CID, - 0x0081 => self::UNKNOWN_CID, - 0x0082 => self::UNKNOWN_CID, - 0x0083 => self::UNKNOWN_CID, - 0x0084 => self::UNKNOWN_CID, - 0x0085 => self::UNKNOWN_CID, - 0x0086 => self::UNKNOWN_CID, - 0x0087 => self::UNKNOWN_CID, - 0x0088 => self::UNKNOWN_CID, - 0x0089 => self::UNKNOWN_CID, - 0x008A => self::UNKNOWN_CID, - 0x008B => self::UNKNOWN_CID, - 0x008C => self::UNKNOWN_CID, - 0x008D => self::UNKNOWN_CID, - 0x008E => self::UNKNOWN_CID, - 0x008F => self::UNKNOWN_CID, - 0x0090 => self::UNKNOWN_CID, - 0x0091 => self::UNKNOWN_CID, - 0x0092 => self::UNKNOWN_CID, - 0x0093 => self::UNKNOWN_CID, - 0x0094 => 
self::UNKNOWN_CID, - 0x0095 => self::UNKNOWN_CID, - 0x0096 => self::UNKNOWN_CID, - 0x0097 => self::UNKNOWN_CID, - 0x0098 => self::UNKNOWN_CID, - 0x0099 => self::UNKNOWN_CID, - 0x009A => self::UNKNOWN_CID, - 0x009B => self::UNKNOWN_CID, - 0x009C => self::UNKNOWN_CID, - 0x009D => self::UNKNOWN_CID, - 0x009E => self::UNKNOWN_CID, - 0x009F => self::UNKNOWN_CID, - 0x00A0 => self::UNKNOWN_CID, - 0x00A1 => self::UNKNOWN_CID, - 0x00A2 => self::UNKNOWN_CID, - 0x00A3 => self::UNKNOWN_CID, - 0x00A4 => self::UNKNOWN_CID, - 0x00A5 => self::UNKNOWN_CID, - 0x00A6 => self::UNKNOWN_CID, - 0x00A7 => self::UNKNOWN_CID, - 0x00A8 => self::UNKNOWN_CID, - 0x00A9 => self::UNKNOWN_CID, - 0x00AA => self::UNKNOWN_CID, - 0x00AB => self::UNKNOWN_CID, - 0x00AC => self::UNKNOWN_CID, - 0x00AD => self::UNKNOWN_CID, - 0x00AE => self::UNKNOWN_CID, - 0x00AF => self::UNKNOWN_CID, - 0x00B0 => self::UNKNOWN_CID, - 0x00B1 => self::UNKNOWN_CID, - 0x00B2 => self::UNKNOWN_CID, - 0x00B3 => self::UNKNOWN_CID, - 0x00B4 => self::UNKNOWN_CID, - 0x00B5 => self::UNKNOWN_CID, - 0x00B6 => self::UNKNOWN_CID, - 0x00B7 => self::UNKNOWN_CID, - 0x00B8 => self::UNKNOWN_CID, - 0x00B9 => self::UNKNOWN_CID, - 0x00BA => self::UNKNOWN_CID, - 0x00BB => self::UNKNOWN_CID, - 0x00BC => self::UNKNOWN_CID, - 0x00BD => self::UNKNOWN_CID, - 0x00BE => self::UNKNOWN_CID, - 0x00BF => self::UNKNOWN_CID, - 0x00C0 => self::UNKNOWN_CID, - 0x00C1 => self::UNKNOWN_CID, - 0x00C2 => self::UNKNOWN_CID, - 0x00C3 => self::UNKNOWN_CID, - 0x00C4 => self::UNKNOWN_CID, - 0x00C5 => self::UNKNOWN_CID, - 0x00C6 => self::UNKNOWN_CID, - 0x00C7 => self::UNKNOWN_CID, - 0x00C8 => self::UNKNOWN_CID, - 0x00C9 => self::UNKNOWN_CID, - 0x00CA => self::UNKNOWN_CID, - 0x00CB => self::UNKNOWN_CID, - 0x00CC => self::UNKNOWN_CID, - 0x00CD => self::UNKNOWN_CID, - 0x00CE => self::UNKNOWN_CID, - 0x00CF => self::UNKNOWN_CID, - 0x00D0 => self::UNKNOWN_CID, - 0x00D1 => self::UNKNOWN_CID, - 0x00D2 => self::UNKNOWN_CID, - 0x00D3 => self::UNKNOWN_CID, - 0x00D4 => 
self::UNKNOWN_CID, - 0x00D5 => self::UNKNOWN_CID, - 0x00D6 => self::UNKNOWN_CID, - 0x00D7 => self::UNKNOWN_CID, - 0x00D8 => self::UNKNOWN_CID, - 0x00D9 => self::UNKNOWN_CID, - 0x00DA => self::UNKNOWN_CID, - 0x00DB => self::UNKNOWN_CID, - 0x00DC => self::UNKNOWN_CID, - 0x00DD => self::UNKNOWN_CID, - 0x00DE => self::UNKNOWN_CID, - 0x00DF => self::UNKNOWN_CID, - 0x00E0 => self::UNKNOWN_CID, - 0x00E1 => self::UNKNOWN_CID, - 0x00E2 => self::UNKNOWN_CID, - 0x00E3 => self::UNKNOWN_CID, - 0x00E4 => self::UNKNOWN_CID, - 0x00E5 => self::UNKNOWN_CID, - 0x00E6 => self::UNKNOWN_CID, - 0x00E7 => self::UNKNOWN_CID, - 0x00E8 => self::UNKNOWN_CID, - 0x00E9 => self::UNKNOWN_CID, - 0x00EA => self::UNKNOWN_CID, - 0x00EB => self::UNKNOWN_CID, - 0x00EC => self::UNKNOWN_CID, - 0x00ED => self::UNKNOWN_CID, - 0x00EE => self::UNKNOWN_CID, - 0x00EF => self::UNKNOWN_CID, - 0x00F0 => self::UNKNOWN_CID, - 0x00F1 => self::UNKNOWN_CID, - 0x00F2 => self::UNKNOWN_CID, - 0x00F3 => self::UNKNOWN_CID, - 0x00F4 => self::UNKNOWN_CID, - 0x00F5 => self::UNKNOWN_CID, - 0x00F6 => self::UNKNOWN_CID, - 0x00F7 => self::UNKNOWN_CID, - 0x00F8 => self::UNKNOWN_CID, - 0x00F9 => self::UNKNOWN_CID, - 0x00FA => self::UNKNOWN_CID, - 0x00FB => self::UNKNOWN_CID, - 0x00FC => self::UNKNOWN_CID, - 0x00FD => self::UNKNOWN_CID, - 0x00FE => self::UNKNOWN_CID, - ) + array + ( + 0x0000 => self::UNKNOWN_CID, + 0x0001 => self::UNKNOWN_CID, + 0x0002 => self::UNKNOWN_CID, + 0x0003 => self::UNKNOWN_CID, + 0x0004 => self::UNKNOWN_CID, + 0x0005 => self::UNKNOWN_CID, + 0x0006 => self::UNKNOWN_CID, + 0x0007 => self::UNKNOWN_CID, + 0x0008 => self::UNKNOWN_CID, + 0x0009 => self::UNKNOWN_CID, + 0x000A => self::UNKNOWN_CID, + 0x000B => self::UNKNOWN_CID, + 0x000C => self::UNKNOWN_CID, + 0x000D => self::UNKNOWN_CID, + 0x000E => self::UNKNOWN_CID, + 0x000F => self::UNKNOWN_CID, + 0x0010 => self::UNKNOWN_CID, + 0x0011 => self::UNKNOWN_CID, + 0x0012 => self::UNKNOWN_CID, + 0x0013 => self::UNKNOWN_CID, + 0x0014 => self::UNKNOWN_CID, + 0x0015 => 
self::UNKNOWN_CID, + 0x0016 => self::UNKNOWN_CID, + 0x0017 => self::UNKNOWN_CID, + 0x0018 => self::UNKNOWN_CID, + 0x0019 => self::UNKNOWN_CID, + 0x001A => self::UNKNOWN_CID, + 0x001B => self::UNKNOWN_CID, + 0x001C => self::UNKNOWN_CID, + 0x001D => self::UNKNOWN_CID, + 0x001E => self::UNKNOWN_CID, + 0x001F => self::UNKNOWN_CID, + 0x0020 => self::UNKNOWN_CID, + 0x0021 => self::UNKNOWN_CID, + 0x0022 => self::UNKNOWN_CID, + 0x0023 => self::UNKNOWN_CID, + 0x0024 => self::UNKNOWN_CID, + 0x0025 => self::UNKNOWN_CID, + 0x0026 => self::UNKNOWN_CID, + 0x0027 => self::UNKNOWN_CID, + 0x0028 => self::UNKNOWN_CID, + 0x0029 => self::UNKNOWN_CID, + 0x002A => self::UNKNOWN_CID, + 0x002B => self::UNKNOWN_CID, + 0x002C => self::UNKNOWN_CID, + 0x002D => self::UNKNOWN_CID, + 0x002E => self::UNKNOWN_CID, + 0x002F => self::UNKNOWN_CID, + 0x0030 => self::UNKNOWN_CID, + 0x0031 => self::UNKNOWN_CID, + 0x0032 => self::UNKNOWN_CID, + 0x0033 => self::UNKNOWN_CID, + 0x0034 => self::UNKNOWN_CID, + 0x0035 => self::UNKNOWN_CID, + 0x0036 => self::UNKNOWN_CID, + 0x0037 => self::UNKNOWN_CID, + 0x0038 => self::UNKNOWN_CID, + 0x0039 => self::UNKNOWN_CID, + 0x003A => self::UNKNOWN_CID, + 0x003B => self::UNKNOWN_CID, + 0x003C => self::UNKNOWN_CID, + 0x003D => self::UNKNOWN_CID, + 0x003E => self::UNKNOWN_CID, + 0x003F => self::UNKNOWN_CID, + 0x0040 => self::UNKNOWN_CID, + 0x0041 => self::UNKNOWN_CID, + 0x0042 => self::UNKNOWN_CID, + 0x0043 => self::UNKNOWN_CID, + 0x0044 => self::UNKNOWN_CID, + 0x0045 => self::UNKNOWN_CID, + 0x0046 => self::UNKNOWN_CID, + 0x0047 => self::UNKNOWN_CID, + 0x0048 => self::UNKNOWN_CID, + 0x0049 => self::UNKNOWN_CID, + 0x004A => self::UNKNOWN_CID, + 0x004B => self::UNKNOWN_CID, + 0x004C => self::UNKNOWN_CID, + 0x004D => self::UNKNOWN_CID, + 0x004E => self::UNKNOWN_CID, + 0x004F => self::UNKNOWN_CID, + 0x0050 => self::UNKNOWN_CID, + 0x0051 => self::UNKNOWN_CID, + 0x0052 => self::UNKNOWN_CID, + 0x0053 => self::UNKNOWN_CID, + 0x0054 => self::UNKNOWN_CID, + 0x0055 => 
self::UNKNOWN_CID, + 0x0056 => self::UNKNOWN_CID, + 0x0057 => self::UNKNOWN_CID, + 0x0058 => self::UNKNOWN_CID, + 0x0059 => self::UNKNOWN_CID, + 0x005A => self::UNKNOWN_CID, + 0x005B => self::UNKNOWN_CID, + 0x005C => self::UNKNOWN_CID, + 0x005D => self::UNKNOWN_CID, + 0x005E => self::UNKNOWN_CID, + 0x005F => self::UNKNOWN_CID, + 0x0060 => self::UNKNOWN_CID, + 0x0061 => self::UNKNOWN_CID, + 0x0062 => self::UNKNOWN_CID, + 0x0063 => self::UNKNOWN_CID, + 0x0064 => self::UNKNOWN_CID, + 0x0065 => self::UNKNOWN_CID, + 0x0066 => self::UNKNOWN_CID, + 0x0067 => self::UNKNOWN_CID, + 0x0068 => self::UNKNOWN_CID, + 0x0069 => self::UNKNOWN_CID, + 0x006A => self::UNKNOWN_CID, + 0x006B => self::UNKNOWN_CID, + 0x006C => self::UNKNOWN_CID, + 0x006D => self::UNKNOWN_CID, + 0x006E => self::UNKNOWN_CID, + 0x006F => self::UNKNOWN_CID, + 0x0070 => self::UNKNOWN_CID, + 0x0071 => self::UNKNOWN_CID, + 0x0072 => self::UNKNOWN_CID, + 0x0073 => self::UNKNOWN_CID, + 0x0074 => self::UNKNOWN_CID, + 0x0075 => self::UNKNOWN_CID, + 0x0076 => self::UNKNOWN_CID, + 0x0077 => self::UNKNOWN_CID, + 0x0078 => self::UNKNOWN_CID, + 0x0079 => self::UNKNOWN_CID, + 0x007A => self::UNKNOWN_CID, + 0x007B => self::UNKNOWN_CID, + 0x007C => self::UNKNOWN_CID, + 0x007D => self::UNKNOWN_CID, + 0x007E => self::UNKNOWN_CID, + 0x007F => self::UNKNOWN_CID, + 0x0080 => self::UNKNOWN_CID, + 0x0081 => self::UNKNOWN_CID, + 0x0082 => self::UNKNOWN_CID, + 0x0083 => self::UNKNOWN_CID, + 0x0084 => self::UNKNOWN_CID, + 0x0085 => self::UNKNOWN_CID, + 0x0086 => self::UNKNOWN_CID, + 0x0087 => self::UNKNOWN_CID, + 0x0088 => self::UNKNOWN_CID, + 0x0089 => self::UNKNOWN_CID, + 0x008A => self::UNKNOWN_CID, + 0x008B => self::UNKNOWN_CID, + 0x008C => self::UNKNOWN_CID, + 0x008D => self::UNKNOWN_CID, + 0x008E => self::UNKNOWN_CID, + 0x008F => self::UNKNOWN_CID, + 0x0090 => self::UNKNOWN_CID, + 0x0091 => self::UNKNOWN_CID, + 0x0092 => self::UNKNOWN_CID, + 0x0093 => self::UNKNOWN_CID, + 0x0094 => self::UNKNOWN_CID, + 0x0095 => 
self::UNKNOWN_CID, + 0x0096 => self::UNKNOWN_CID, + 0x0097 => self::UNKNOWN_CID, + 0x0098 => self::UNKNOWN_CID, + 0x0099 => self::UNKNOWN_CID, + 0x009A => self::UNKNOWN_CID, + 0x009B => self::UNKNOWN_CID, + 0x009C => self::UNKNOWN_CID, + 0x009D => self::UNKNOWN_CID, + 0x009E => self::UNKNOWN_CID, + 0x009F => self::UNKNOWN_CID, + 0x00A0 => self::UNKNOWN_CID, + 0x00A1 => self::UNKNOWN_CID, + 0x00A2 => self::UNKNOWN_CID, + 0x00A3 => self::UNKNOWN_CID, + 0x00A4 => self::UNKNOWN_CID, + 0x00A5 => self::UNKNOWN_CID, + 0x00A6 => self::UNKNOWN_CID, + 0x00A7 => self::UNKNOWN_CID, + 0x00A8 => self::UNKNOWN_CID, + 0x00A9 => self::UNKNOWN_CID, + 0x00AA => self::UNKNOWN_CID, + 0x00AB => self::UNKNOWN_CID, + 0x00AC => self::UNKNOWN_CID, + 0x00AD => self::UNKNOWN_CID, + 0x00AE => self::UNKNOWN_CID, + 0x00AF => self::UNKNOWN_CID, + 0x00B0 => self::UNKNOWN_CID, + 0x00B1 => self::UNKNOWN_CID, + 0x00B2 => self::UNKNOWN_CID, + 0x00B3 => self::UNKNOWN_CID, + 0x00B4 => self::UNKNOWN_CID, + 0x00B5 => self::UNKNOWN_CID, + 0x00B6 => self::UNKNOWN_CID, + 0x00B7 => self::UNKNOWN_CID, + 0x00B8 => self::UNKNOWN_CID, + 0x00B9 => self::UNKNOWN_CID, + 0x00BA => self::UNKNOWN_CID, + 0x00BB => self::UNKNOWN_CID, + 0x00BC => self::UNKNOWN_CID, + 0x00BD => self::UNKNOWN_CID, + 0x00BE => self::UNKNOWN_CID, + 0x00BF => self::UNKNOWN_CID, + 0x00C0 => self::UNKNOWN_CID, + 0x00C1 => self::UNKNOWN_CID, + 0x00C2 => self::UNKNOWN_CID, + 0x00C3 => self::UNKNOWN_CID, + 0x00C4 => self::UNKNOWN_CID, + 0x00C5 => self::UNKNOWN_CID, + 0x00C6 => self::UNKNOWN_CID, + 0x00C7 => self::UNKNOWN_CID, + 0x00C8 => self::UNKNOWN_CID, + 0x00C9 => self::UNKNOWN_CID, + 0x00CA => self::UNKNOWN_CID, + 0x00CB => self::UNKNOWN_CID, + 0x00CC => self::UNKNOWN_CID, + 0x00CD => self::UNKNOWN_CID, + 0x00CE => self::UNKNOWN_CID, + 0x00CF => self::UNKNOWN_CID, + 0x00D0 => self::UNKNOWN_CID, + 0x00D1 => self::UNKNOWN_CID, + 0x00D2 => self::UNKNOWN_CID, + 0x00D3 => self::UNKNOWN_CID, + 0x00D4 => self::UNKNOWN_CID, + 0x00D5 => 
self::UNKNOWN_CID, + 0x00D6 => self::UNKNOWN_CID, + 0x00D7 => self::UNKNOWN_CID, + 0x00D8 => self::UNKNOWN_CID, + 0x00D9 => self::UNKNOWN_CID, + 0x00DA => self::UNKNOWN_CID, + 0x00DB => self::UNKNOWN_CID, + 0x00DC => self::UNKNOWN_CID, + 0x00DD => self::UNKNOWN_CID, + 0x00DE => self::UNKNOWN_CID, + 0x00DF => self::UNKNOWN_CID, + 0x00E0 => self::UNKNOWN_CID, + 0x00E1 => self::UNKNOWN_CID, + 0x00E2 => self::UNKNOWN_CID, + 0x00E3 => self::UNKNOWN_CID, + 0x00E4 => self::UNKNOWN_CID, + 0x00E5 => self::UNKNOWN_CID, + 0x00E6 => self::UNKNOWN_CID, + 0x00E7 => self::UNKNOWN_CID, + 0x00E8 => self::UNKNOWN_CID, + 0x00E9 => self::UNKNOWN_CID, + 0x00EA => self::UNKNOWN_CID, + 0x00EB => self::UNKNOWN_CID, + 0x00EC => self::UNKNOWN_CID, + 0x00ED => self::UNKNOWN_CID, + 0x00EE => self::UNKNOWN_CID, + 0x00EF => self::UNKNOWN_CID, + 0x00F0 => self::UNKNOWN_CID, + 0x00F1 => self::UNKNOWN_CID, + 0x00F2 => self::UNKNOWN_CID, + 0x00F3 => self::UNKNOWN_CID, + 0x00F4 => self::UNKNOWN_CID, + 0x00F5 => self::UNKNOWN_CID, + 0x00F6 => self::UNKNOWN_CID, + 0x00F7 => self::UNKNOWN_CID, + 0x00F8 => self::UNKNOWN_CID, + 0x00F9 => self::UNKNOWN_CID, + 0x00FA => self::UNKNOWN_CID, + 0x00FB => self::UNKNOWN_CID, + 0x00FC => self::UNKNOWN_CID, + 0x00FD => self::UNKNOWN_CID, + 0x00FE => self::UNKNOWN_CID, + ) ) ; \ No newline at end of file diff --git a/CIDTables/IDENTITY-H.cid b/CIDTables/IDENTITY-H.cid index 7c596db..c9a747b 100755 --- a/CIDTables/IDENTITY-H.cid +++ b/CIDTables/IDENTITY-H.cid @@ -1,406 +1,406 @@ - array - ( - 0 => self::ALT_CID, // Seems to be a prefix for accentuated characters - 1 => self::ALT_CID, // Same - 2 => self::UNKNOWN_CID, - 3 => ' ', - 4 => '!', - 5 => '"', - 6 => self::UNKNOWN_CID, - 7 => self::UNKNOWN_CID, - 8 => '%', - 9 => '&', - 10 => "'", - 11 => '(', - 12 => ')', - 13 => '*', - 14 => '+', - 15 => ',', - 16 => '-', - 17 => '.', - 18 => '/', - 19 => '0', - 20 => '1', - 21 => '2', - 22 => '3', - 23 => '4', - 24 => '5', - 25 => '6', - 26 => '7', - 27 => '8', - 28 => 
'9', - 29 => ':', - 30 => ';', - 31 => '<', - 32 => '=', - 33 => '>', - 34 => '?', - 35 => '@', - 36 => 'A', - 37 => 'B', - 38 => 'C', - 39 => 'D', - 40 => 'E', - 41 => 'F', - 42 => 'G', - 43 => 'H', - 44 => 'I', - 45 => 'J', - 46 => 'K', - 47 => 'L', - 48 => 'M', - 49 => 'N', - 50 => 'O', - 51 => 'P', - 52 => 'Q', - 53 => 'R', - 54 => 'S', - 55 => 'T', - 56 => 'U', - 57 => 'V', - 58 => 'W', - 59 => 'X', - 60 => 'Y', - 61 => 'Z', - 62 => '[', - 63 => '\\', - 64 => ']', - 65 => '^', - 66 => '_', - 67 => '`', - 68 => 'a', - 69 => 'b', - 70 => 'c', - 71 => 'd', - 72 => 'e', - 73 => 'f', - 74 => 'g', - 75 => 'h', - 76 => 'i', - 77 => 'j', - 78 => 'k', - 79 => 'l', - 80 => 'm', - 81 => 'n', - 82 => 'o', - 83 => 'p', - 84 => 'q', - 85 => 'r', - 86 => 's', - 87 => 't', - 88 => 'u', - 89 => 'v', - 90 => 'w', - 91 => 'x', - 92 => 'y', - 93 => 'z', - 94 => self::UNKNOWN_CID, - 95 => self::UNKNOWN_CID, - 96 => self::UNKNOWN_CID, - 97 => self::UNKNOWN_CID, - 98 => self::UNKNOWN_CID, - 99 => self::UNKNOWN_CID, - 100 => self::UNKNOWN_CID, - 101 => 'É', - 102 => self::UNKNOWN_CID, - 103 => self::UNKNOWN_CID, - 104 => self::UNKNOWN_CID, - 105 => self::UNKNOWN_CID, - 106 => 'à', - 107 => 'â', - 108 => self::UNKNOWN_CID, - 109 => self::UNKNOWN_CID, - 110 => self::UNKNOWN_CID, - 111 => self::UNKNOWN_CID, - 112 => 'é', - 113 => 'è', - 114 => self::UNKNOWN_CID, - 115 => self::UNKNOWN_CID, - 116 => self::UNKNOWN_CID, - 117 => self::UNKNOWN_CID, - 118 => self::UNKNOWN_CID, - 119 => self::UNKNOWN_CID, - 120 => self::UNKNOWN_CID, - 121 => 'ó', - 122 => self::UNKNOWN_CID, - 123 => 'ô', - 124 => 'ö', - 125 => self::UNKNOWN_CID, - 126 => 'ú', - 127 => self::UNKNOWN_CID, - 128 => 'û', - 131 => '°', - 172 => 'À', - 177 => '-', - 182 => "'", - 188 => '€' - ), - 'alt' => array - ( - 0 => array - ( - 0 => self::UNKNOWN_CID, - 1 => self::UNKNOWN_CID, - 2 => self::UNKNOWN_CID, - 3 => ' ', - 4 => self::UNKNOWN_CID, - 5 => self::UNKNOWN_CID, - 6 => self::UNKNOWN_CID, - 7 => self::UNKNOWN_CID, - 8 => 
self::UNKNOWN_CID, - 9 => self::UNKNOWN_CID, - 10 => 'ę', - 11 => '(', - 12 => ')', - 13 => self::UNKNOWN_CID, - 14 => '+', - 15 => ',', - 16 => '-', - 17 => '.', - 18 => '/', - 19 => '0', - 20 => '1', - 21 => '2', - 22 => '3', - 23 => '4', - 24 => '5', - 25 => '6', - 26 => '7', - 27 => '8', - 28 => '9', - 29 => ':', - 30 => ';', - 31 => self::UNKNOWN_CID, - 32 => self::UNKNOWN_CID, - 33 => self::UNKNOWN_CID, - 34 => self::UNKNOWN_CID, - 35 => self::UNKNOWN_CID, - 36 => 'A', - 37 => 'B', - 38 => 'C', - 39 => 'D', - 40 => 'E', - 41 => 'F', - 42 => 'G', - 43 => 'H', - 44 => 'I', - 45 => 'J', - 46 => 'K', - 47 => 'L', - 48 => 'M', - 49 => 'N', - 50 => 'O', - 51 => 'P', - 52 => 'Q', - 53 => 'R', - 54 => 'S', - 55 => 'T', - 56 => 'U', - 57 => 'V', - 58 => 'W', - 59 => 'X', - 60 => 'Y', - 61 => 'Z', - 62 => self::UNKNOWN_CID, - 63 => self::UNKNOWN_CID, - 64 => self::UNKNOWN_CID, - 65 => self::UNKNOWN_CID, - 66 => self::UNKNOWN_CID, - 67 => self::UNKNOWN_CID, - 68 => 'a', - 69 => 'b', - 70 => 'c', - 71 => 'd', - 72 => 'e', - 73 => 'f', - 74 => 'g', - 75 => 'h', - 76 => 'i', - 77 => 'j', - 78 => 'k', - 79 => 'l', - 80 => 'm', - 81 => 'n', - 82 => 'o', - 83 => 'p', - 84 => 'q', - 85 => 'r', - 86 => 's', - 87 => 't', - 88 => 'u', - 89 => 'v', - 90 => 'w', - 91 => 'x', - 92 => 'y', - 93 => 'z', - 94 => self::UNKNOWN_CID, - 95 => self::UNKNOWN_CID, - 96 => self::UNKNOWN_CID, - 97 => self::UNKNOWN_CID, - 98 => self::UNKNOWN_CID, - 99 => self::UNKNOWN_CID, - 100 => self::UNKNOWN_CID, - 101 => self::UNKNOWN_CID, - 102 => self::UNKNOWN_CID, - 103 => self::UNKNOWN_CID, - 104 => self::UNKNOWN_CID, - 105 => self::UNKNOWN_CID, - 106 => self::UNKNOWN_CID, - 107 => self::UNKNOWN_CID, - 108 => self::UNKNOWN_CID, - 109 => self::UNKNOWN_CID, - 110 => self::UNKNOWN_CID, - 111 => self::UNKNOWN_CID, - 112 => self::UNKNOWN_CID, - 113 => self::UNKNOWN_CID, - 114 => self::UNKNOWN_CID, - 115 => self::UNKNOWN_CID, - 116 => self::UNKNOWN_CID, - 117 => self::UNKNOWN_CID, - 118 => self::UNKNOWN_CID, 
- 119 => self::UNKNOWN_CID, - 120 => self::UNKNOWN_CID, - 121 => self::UNKNOWN_CID, - 122 => self::UNKNOWN_CID, - 123 => self::UNKNOWN_CID, - 124 => self::UNKNOWN_CID, - 125 => self::UNKNOWN_CID, - 126 => self::UNKNOWN_CID, - 127 => self::UNKNOWN_CID - ), - 1 => array - ( - 0 => self::UNKNOWN_CID, - 1 => self::UNKNOWN_CID, - 2 => self::UNKNOWN_CID, - 3 => self::UNKNOWN_CID, - 4 => self::UNKNOWN_CID, - 5 => 'ą', - 6 => self::UNKNOWN_CID, - 7 => self::UNKNOWN_CID, - 8 => self::UNKNOWN_CID, - 9 => self::UNKNOWN_CID, - 10 => self::UNKNOWN_CID, - 11 => self::UNKNOWN_CID, - 12 => self::UNKNOWN_CID, - 13 => self::UNKNOWN_CID, - 14 => self::UNKNOWN_CID, - 15 => self::UNKNOWN_CID, - 16 => self::UNKNOWN_CID, - 17 => self::UNKNOWN_CID, - 18 => self::UNKNOWN_CID, - 19 => self::UNKNOWN_CID, - 20 => 'ń', - 21 => self::UNKNOWN_CID, - 22 => self::UNKNOWN_CID, - 23 => self::UNKNOWN_CID, - 24 => self::UNKNOWN_CID, - 25 => self::UNKNOWN_CID, - 26 => self::UNKNOWN_CID, - 27 => self::UNKNOWN_CID, - 28 => self::UNKNOWN_CID, - 29 => 'Ś', - 30 => 'ṥ', - 31 => self::UNKNOWN_CID, - 32 => self::UNKNOWN_CID, - 33 => self::UNKNOWN_CID, - 34 => self::UNKNOWN_CID, - 35 => self::UNKNOWN_CID, - 36 => self::UNKNOWN_CID, - 37 => self::UNKNOWN_CID, - 38 => self::UNKNOWN_CID, - 39 => self::UNKNOWN_CID, - 40 => self::UNKNOWN_CID, - 41 => 'Ż', - 42 => 'ż', - 43 => self::UNKNOWN_CID, - 44 => self::UNKNOWN_CID, - 45 => self::UNKNOWN_CID, - 46 => self::UNKNOWN_CID, - 47 => self::UNKNOWN_CID, - 48 => self::UNKNOWN_CID, - 49 => self::UNKNOWN_CID, - 50 => self::UNKNOWN_CID, - 51 => self::UNKNOWN_CID, - 52 => self::UNKNOWN_CID, - 53 => self::UNKNOWN_CID, - 54 => self::UNKNOWN_CID, - 55 => self::UNKNOWN_CID, - 56 => self::UNKNOWN_CID, - 57 => self::UNKNOWN_CID, - 58 => self::UNKNOWN_CID, - 59 => self::UNKNOWN_CID, - 60 => self::UNKNOWN_CID, - 61 => self::UNKNOWN_CID, - 62 => self::UNKNOWN_CID, - 63 => self::UNKNOWN_CID, - 64 => self::UNKNOWN_CID, - 65 => self::UNKNOWN_CID, - 66 => self::UNKNOWN_CID, - 67 => 
self::UNKNOWN_CID, - 68 => self::UNKNOWN_CID, - 69 => self::UNKNOWN_CID, - 70 => self::UNKNOWN_CID, - 71 => self::UNKNOWN_CID, - 72 => self::UNKNOWN_CID, - 73 => self::UNKNOWN_CID, - 74 => self::UNKNOWN_CID, - 75 => self::UNKNOWN_CID, - 76 => self::UNKNOWN_CID, - 77 => self::UNKNOWN_CID, - 78 => self::UNKNOWN_CID, - 79 => self::UNKNOWN_CID, - 80 => self::UNKNOWN_CID, - 81 => self::UNKNOWN_CID, - 82 => self::UNKNOWN_CID, - 83 => self::UNKNOWN_CID, - 84 => self::UNKNOWN_CID, - 85 => self::UNKNOWN_CID, - 86 => self::UNKNOWN_CID, - 87 => self::UNKNOWN_CID, - 88 => self::UNKNOWN_CID, - 89 => self::UNKNOWN_CID, - 90 => self::UNKNOWN_CID, - 91 => self::UNKNOWN_CID, - 92 => self::UNKNOWN_CID, - 93 => self::UNKNOWN_CID, - 94 => self::UNKNOWN_CID, - 95 => self::UNKNOWN_CID, - 96 => self::UNKNOWN_CID, - 97 => self::UNKNOWN_CID, - 98 => self::UNKNOWN_CID, - 99 => self::UNKNOWN_CID, - 100 => self::UNKNOWN_CID, - 101 => self::UNKNOWN_CID, - 102 => self::UNKNOWN_CID, - 103 => self::UNKNOWN_CID, - 104 => self::UNKNOWN_CID, - 105 => self::UNKNOWN_CID, - 106 => self::UNKNOWN_CID, - 107 => self::UNKNOWN_CID, - 108 => self::UNKNOWN_CID, - 109 => self::UNKNOWN_CID, - 110 => self::UNKNOWN_CID, - 111 => self::UNKNOWN_CID, - 112 => self::UNKNOWN_CID, - 113 => self::UNKNOWN_CID, - 114 => self::UNKNOWN_CID, - 115 => self::UNKNOWN_CID, - 116 => self::UNKNOWN_CID, - 117 => self::UNKNOWN_CID, - 118 => self::UNKNOWN_CID, - 119 => self::UNKNOWN_CID, - 120 => self::UNKNOWN_CID, - 121 => self::UNKNOWN_CID, - 122 => self::UNKNOWN_CID, - 123 => self::UNKNOWN_CID, - 124 => self::UNKNOWN_CID, - 125 => self::UNKNOWN_CID, - 126 => self::UNKNOWN_CID, - 127 => self::UNKNOWN_CID, - ) - ) - ) ; + array + ( + 0 => self::ALT_CID, // Seems to be a prefix for accentuated characters + 1 => self::ALT_CID, // Same + 2 => self::UNKNOWN_CID, + 3 => ' ', + 4 => '!', + 5 => '"', + 6 => self::UNKNOWN_CID, + 7 => self::UNKNOWN_CID, + 8 => '%', + 9 => '&', + 10 => "'", + 11 => '(', + 12 => ')', + 13 => '*', + 14 => '+', 
+ 15 => ',', + 16 => '-', + 17 => '.', + 18 => '/', + 19 => '0', + 20 => '1', + 21 => '2', + 22 => '3', + 23 => '4', + 24 => '5', + 25 => '6', + 26 => '7', + 27 => '8', + 28 => '9', + 29 => ':', + 30 => ';', + 31 => '<', + 32 => '=', + 33 => '>', + 34 => '?', + 35 => '@', + 36 => 'A', + 37 => 'B', + 38 => 'C', + 39 => 'D', + 40 => 'E', + 41 => 'F', + 42 => 'G', + 43 => 'H', + 44 => 'I', + 45 => 'J', + 46 => 'K', + 47 => 'L', + 48 => 'M', + 49 => 'N', + 50 => 'O', + 51 => 'P', + 52 => 'Q', + 53 => 'R', + 54 => 'S', + 55 => 'T', + 56 => 'U', + 57 => 'V', + 58 => 'W', + 59 => 'X', + 60 => 'Y', + 61 => 'Z', + 62 => '[', + 63 => '\\', + 64 => ']', + 65 => '^', + 66 => '_', + 67 => '`', + 68 => 'a', + 69 => 'b', + 70 => 'c', + 71 => 'd', + 72 => 'e', + 73 => 'f', + 74 => 'g', + 75 => 'h', + 76 => 'i', + 77 => 'j', + 78 => 'k', + 79 => 'l', + 80 => 'm', + 81 => 'n', + 82 => 'o', + 83 => 'p', + 84 => 'q', + 85 => 'r', + 86 => 's', + 87 => 't', + 88 => 'u', + 89 => 'v', + 90 => 'w', + 91 => 'x', + 92 => 'y', + 93 => 'z', + 94 => self::UNKNOWN_CID, + 95 => self::UNKNOWN_CID, + 96 => self::UNKNOWN_CID, + 97 => self::UNKNOWN_CID, + 98 => self::UNKNOWN_CID, + 99 => self::UNKNOWN_CID, + 100 => self::UNKNOWN_CID, + 101 => 'É', + 102 => self::UNKNOWN_CID, + 103 => self::UNKNOWN_CID, + 104 => self::UNKNOWN_CID, + 105 => self::UNKNOWN_CID, + 106 => 'à', + 107 => 'â', + 108 => self::UNKNOWN_CID, + 109 => self::UNKNOWN_CID, + 110 => self::UNKNOWN_CID, + 111 => self::UNKNOWN_CID, + 112 => 'é', + 113 => 'è', + 114 => self::UNKNOWN_CID, + 115 => self::UNKNOWN_CID, + 116 => self::UNKNOWN_CID, + 117 => self::UNKNOWN_CID, + 118 => self::UNKNOWN_CID, + 119 => self::UNKNOWN_CID, + 120 => self::UNKNOWN_CID, + 121 => 'ó', + 122 => self::UNKNOWN_CID, + 123 => 'ô', + 124 => 'ö', + 125 => self::UNKNOWN_CID, + 126 => 'ú', + 127 => self::UNKNOWN_CID, + 128 => 'û', + 131 => '°', + 172 => 'À', + 177 => '-', + 182 => "'", + 188 => '€' + ), + 'alt' => array + ( + 0 => array + ( + 0 => self::UNKNOWN_CID, 
+ 1 => self::UNKNOWN_CID, + 2 => self::UNKNOWN_CID, + 3 => ' ', + 4 => self::UNKNOWN_CID, + 5 => self::UNKNOWN_CID, + 6 => self::UNKNOWN_CID, + 7 => self::UNKNOWN_CID, + 8 => self::UNKNOWN_CID, + 9 => self::UNKNOWN_CID, + 10 => 'ę', + 11 => '(', + 12 => ')', + 13 => self::UNKNOWN_CID, + 14 => '+', + 15 => ',', + 16 => '-', + 17 => '.', + 18 => '/', + 19 => '0', + 20 => '1', + 21 => '2', + 22 => '3', + 23 => '4', + 24 => '5', + 25 => '6', + 26 => '7', + 27 => '8', + 28 => '9', + 29 => ':', + 30 => ';', + 31 => self::UNKNOWN_CID, + 32 => self::UNKNOWN_CID, + 33 => self::UNKNOWN_CID, + 34 => self::UNKNOWN_CID, + 35 => self::UNKNOWN_CID, + 36 => 'A', + 37 => 'B', + 38 => 'C', + 39 => 'D', + 40 => 'E', + 41 => 'F', + 42 => 'G', + 43 => 'H', + 44 => 'I', + 45 => 'J', + 46 => 'K', + 47 => 'L', + 48 => 'M', + 49 => 'N', + 50 => 'O', + 51 => 'P', + 52 => 'Q', + 53 => 'R', + 54 => 'S', + 55 => 'T', + 56 => 'U', + 57 => 'V', + 58 => 'W', + 59 => 'X', + 60 => 'Y', + 61 => 'Z', + 62 => self::UNKNOWN_CID, + 63 => self::UNKNOWN_CID, + 64 => self::UNKNOWN_CID, + 65 => self::UNKNOWN_CID, + 66 => self::UNKNOWN_CID, + 67 => self::UNKNOWN_CID, + 68 => 'a', + 69 => 'b', + 70 => 'c', + 71 => 'd', + 72 => 'e', + 73 => 'f', + 74 => 'g', + 75 => 'h', + 76 => 'i', + 77 => 'j', + 78 => 'k', + 79 => 'l', + 80 => 'm', + 81 => 'n', + 82 => 'o', + 83 => 'p', + 84 => 'q', + 85 => 'r', + 86 => 's', + 87 => 't', + 88 => 'u', + 89 => 'v', + 90 => 'w', + 91 => 'x', + 92 => 'y', + 93 => 'z', + 94 => self::UNKNOWN_CID, + 95 => self::UNKNOWN_CID, + 96 => self::UNKNOWN_CID, + 97 => self::UNKNOWN_CID, + 98 => self::UNKNOWN_CID, + 99 => self::UNKNOWN_CID, + 100 => self::UNKNOWN_CID, + 101 => self::UNKNOWN_CID, + 102 => self::UNKNOWN_CID, + 103 => self::UNKNOWN_CID, + 104 => self::UNKNOWN_CID, + 105 => self::UNKNOWN_CID, + 106 => self::UNKNOWN_CID, + 107 => self::UNKNOWN_CID, + 108 => self::UNKNOWN_CID, + 109 => self::UNKNOWN_CID, + 110 => self::UNKNOWN_CID, + 111 => self::UNKNOWN_CID, + 112 => 
self::UNKNOWN_CID, + 113 => self::UNKNOWN_CID, + 114 => self::UNKNOWN_CID, + 115 => self::UNKNOWN_CID, + 116 => self::UNKNOWN_CID, + 117 => self::UNKNOWN_CID, + 118 => self::UNKNOWN_CID, + 119 => self::UNKNOWN_CID, + 120 => self::UNKNOWN_CID, + 121 => self::UNKNOWN_CID, + 122 => self::UNKNOWN_CID, + 123 => self::UNKNOWN_CID, + 124 => self::UNKNOWN_CID, + 125 => self::UNKNOWN_CID, + 126 => self::UNKNOWN_CID, + 127 => self::UNKNOWN_CID + ), + 1 => array + ( + 0 => self::UNKNOWN_CID, + 1 => self::UNKNOWN_CID, + 2 => self::UNKNOWN_CID, + 3 => self::UNKNOWN_CID, + 4 => self::UNKNOWN_CID, + 5 => 'ą', + 6 => self::UNKNOWN_CID, + 7 => self::UNKNOWN_CID, + 8 => self::UNKNOWN_CID, + 9 => self::UNKNOWN_CID, + 10 => self::UNKNOWN_CID, + 11 => self::UNKNOWN_CID, + 12 => self::UNKNOWN_CID, + 13 => self::UNKNOWN_CID, + 14 => self::UNKNOWN_CID, + 15 => self::UNKNOWN_CID, + 16 => self::UNKNOWN_CID, + 17 => self::UNKNOWN_CID, + 18 => self::UNKNOWN_CID, + 19 => self::UNKNOWN_CID, + 20 => 'ń', + 21 => self::UNKNOWN_CID, + 22 => self::UNKNOWN_CID, + 23 => self::UNKNOWN_CID, + 24 => self::UNKNOWN_CID, + 25 => self::UNKNOWN_CID, + 26 => self::UNKNOWN_CID, + 27 => self::UNKNOWN_CID, + 28 => self::UNKNOWN_CID, + 29 => 'Ś', + 30 => 'ṥ', + 31 => self::UNKNOWN_CID, + 32 => self::UNKNOWN_CID, + 33 => self::UNKNOWN_CID, + 34 => self::UNKNOWN_CID, + 35 => self::UNKNOWN_CID, + 36 => self::UNKNOWN_CID, + 37 => self::UNKNOWN_CID, + 38 => self::UNKNOWN_CID, + 39 => self::UNKNOWN_CID, + 40 => self::UNKNOWN_CID, + 41 => 'Ż', + 42 => 'ż', + 43 => self::UNKNOWN_CID, + 44 => self::UNKNOWN_CID, + 45 => self::UNKNOWN_CID, + 46 => self::UNKNOWN_CID, + 47 => self::UNKNOWN_CID, + 48 => self::UNKNOWN_CID, + 49 => self::UNKNOWN_CID, + 50 => self::UNKNOWN_CID, + 51 => self::UNKNOWN_CID, + 52 => self::UNKNOWN_CID, + 53 => self::UNKNOWN_CID, + 54 => self::UNKNOWN_CID, + 55 => self::UNKNOWN_CID, + 56 => self::UNKNOWN_CID, + 57 => self::UNKNOWN_CID, + 58 => self::UNKNOWN_CID, + 59 => self::UNKNOWN_CID, + 60 => 
self::UNKNOWN_CID, + 61 => self::UNKNOWN_CID, + 62 => self::UNKNOWN_CID, + 63 => self::UNKNOWN_CID, + 64 => self::UNKNOWN_CID, + 65 => self::UNKNOWN_CID, + 66 => self::UNKNOWN_CID, + 67 => self::UNKNOWN_CID, + 68 => self::UNKNOWN_CID, + 69 => self::UNKNOWN_CID, + 70 => self::UNKNOWN_CID, + 71 => self::UNKNOWN_CID, + 72 => self::UNKNOWN_CID, + 73 => self::UNKNOWN_CID, + 74 => self::UNKNOWN_CID, + 75 => self::UNKNOWN_CID, + 76 => self::UNKNOWN_CID, + 77 => self::UNKNOWN_CID, + 78 => self::UNKNOWN_CID, + 79 => self::UNKNOWN_CID, + 80 => self::UNKNOWN_CID, + 81 => self::UNKNOWN_CID, + 82 => self::UNKNOWN_CID, + 83 => self::UNKNOWN_CID, + 84 => self::UNKNOWN_CID, + 85 => self::UNKNOWN_CID, + 86 => self::UNKNOWN_CID, + 87 => self::UNKNOWN_CID, + 88 => self::UNKNOWN_CID, + 89 => self::UNKNOWN_CID, + 90 => self::UNKNOWN_CID, + 91 => self::UNKNOWN_CID, + 92 => self::UNKNOWN_CID, + 93 => self::UNKNOWN_CID, + 94 => self::UNKNOWN_CID, + 95 => self::UNKNOWN_CID, + 96 => self::UNKNOWN_CID, + 97 => self::UNKNOWN_CID, + 98 => self::UNKNOWN_CID, + 99 => self::UNKNOWN_CID, + 100 => self::UNKNOWN_CID, + 101 => self::UNKNOWN_CID, + 102 => self::UNKNOWN_CID, + 103 => self::UNKNOWN_CID, + 104 => self::UNKNOWN_CID, + 105 => self::UNKNOWN_CID, + 106 => self::UNKNOWN_CID, + 107 => self::UNKNOWN_CID, + 108 => self::UNKNOWN_CID, + 109 => self::UNKNOWN_CID, + 110 => self::UNKNOWN_CID, + 111 => self::UNKNOWN_CID, + 112 => self::UNKNOWN_CID, + 113 => self::UNKNOWN_CID, + 114 => self::UNKNOWN_CID, + 115 => self::UNKNOWN_CID, + 116 => self::UNKNOWN_CID, + 117 => self::UNKNOWN_CID, + 118 => self::UNKNOWN_CID, + 119 => self::UNKNOWN_CID, + 120 => self::UNKNOWN_CID, + 121 => self::UNKNOWN_CID, + 122 => self::UNKNOWN_CID, + 123 => self::UNKNOWN_CID, + 124 => self::UNKNOWN_CID, + 125 => self::UNKNOWN_CID, + 126 => self::UNKNOWN_CID, + 127 => self::UNKNOWN_CID, + ) + ) + ) ; diff --git a/CREDITS.md b/CREDITS.md index 047b9c9..bfd632e 100755 --- a/CREDITS.md +++ b/CREDITS.md @@ -1,56 +1,56 @@ -# 
INTRODUCTION # - -I wanted to warmly thank a whole bunch of people here that helped me to enhance my **PdfToText** class. Of course, there is still a lot of work to do, but without their help, I could not have achieved anything reliable. - -# INSPIRATIONS # - -My first thanks go to the following people : - -- The people at Adobe who wrote the Pdf File format reference (I used version 1.7 of this reference, available here : [http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf](http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf "http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf")). As for all specifications and standards, it leaves room for ambiguity, but this is a high quality document that every people concerned with PDF issues should read first (well, as far as you can spen enough time to walk through the 1300 pages that this document contains...) -- The phpclasses.org site, which allowed me to publish this class and provided me with a medium to help new users of my class to solve issues -- The "*unknown developer*" ; when I have been asked for the first time at work to extract text from pdf files, I have been provided with the code referenced here : [contributions/pdftotext.php](contributions/pdftotext.php "contributions/pdftotext.php"). I don't know who this developer is (the source code did not contain any name), but I would like to really thank him, because his works were able to rapidly give me some knowledge of the PDF file format. Although I developed my class my own way, I borrowed from him the **decodeAsciiHex** and **decodeAscii85** functions. -- I also would like to thank Adeel Ahmad Khan whose works gave me further understanding of the Pdf file format ([https://github.com/adeel/php-pdf-parser](https://github.com/adeel/php-pdf-parser "https://github.com/adeel/php-pdf-parser")). 
-- The author of the TCPDF package - -# USERS OF PDFTOTEXT # - -When I first published the **PdfToText** class on the *phpclasses.org* site, I already knew that it was not able to handle all the possible situations. The Pdf file format is so versatile that I could not get enough samples to check my class against them. - -This is why I clearly asked users to send me sample Pdf files whenever they encountered issues on them, to help me enhance my class ; and every user played the game, so I would like to thank the following people, in completely random order (and hoping that I did not miss anyone of them...) : - -- Pawel Lancucki and Blaine Hilton, who helped me to document more precisely on the fact that the class required a PHP version greater or equal to 5.5 -- Stephen Layton, who provided me with samples and actively tested my new versions -- Theodis Butler, who also provided me with samples and helped me solve some issues -- Rafael Rojas Torres, who gave me the idea of handling password-protected pdf files. Although not yet implemented, this definitely is on my roadmap. -- Rolf Kellner , who had issues with unicode translations on far-east and middle-east languages. I have not yet solved them, but I'm still working on it ! -- Yuri Kadeev, who had issues with data presented in tabular format. I won't be able to solve all of them, but it helped me a lot to solve issues presented by other users. -- Steve Majors, from CashFlowProducts.com, who gave me several samples to work on and even provided me with an access to his bug-tracking system. He also gave me the idea to implement PDF form data extraction. -- Antonio Jùnior, who gave me a sample using text images encoded in a format I did not handle yet ; still under work... 
-- Tom Perro and the user named *srizoophari*, because they gave me the idea of implementing some features which allow for searching text page by page (and handle page contents separately, instead of a single block of text) -- Shishant Todi, who gave me pdf samples built in a very strange way. I'm still working on them ! -- Menny Grossmann, for yet another way to write pdf files in tabular format -- Carin Pretorius, who sent me samples with the biggest character maps I've ever seen -- Rolf Mast, who sent me samples that (finally) did not use character maps and allowed me to solve more easily a few bugs in the way I was parsing text drawing instructions -- Rajnish Tatiwala who, among other things, sent me my first sample containing objects encoded with the CCITT FAX standard. -- Jocemar Varela, who sent me samples that were generated using OCR software -- Aryan Schmitz, who tried to port the PdfToText class to a system running PHP 5.2. This successful attempt led to long and detailed support exchanges, where I really appreciated his proactivity. -- Francisco Godoy, who sent me a PDF sample which was enough simple to help me solve a long-time issue I had with PDF files using inline "templates" -- Patryk, from expromo.pl, who helped me to handle font aliases that are local to a page by providing me with samples easy enough to investigate ; the same samples also helped me - to start an experimental support of CID fonts. -- DL, from aloha-intbiz.eu, who suggested me to give the option of saving images on-the-fly without storing them into memory, and who gave me samples that helped me to process more cases during image extraction. -- Kis Balazs, who gave me a sample which helped me understand why sometimes certain accentuated characters where incorrectly translated. 
-- Massimo Baglione, who sent me sample covering various artistic domains, and helped me identify and correct -- Aldo Mariussi, which supplied me a 1-page sample that helped me enhance the way 2-bytes character sequences are decoded when specified between parentheses -- Jens Kirk and Roshayne Jaimon, who sent me samples containing compound objects, which incited me to entirely review the way I was handling them. As a side-effect, this new way of handling PDF objects solved as a miracle 3 other bugs ! -- Youen Toupin, with a sample containing embedded binary images in regular PDF object streams, which is a feature I was not aware of -- Javier Diez, who submitted a sample created by Quark XPress and helped me solve basic problems not seen so far -- Thomas Bourgeois, who sent me a very comprehensive set of small PDF samples showing issues, some of them were existing for long -- Rob Webster, who sent me my first sample using the LZW compression algorithm -- Darren Jett for his support in helping me to start implementing page layout rendering -- Manuel Osuna, who gave me the idea of defining text capture areas inside a PDF file -- Luis Manuel Reyes, who submitted me samples that helped me improve text layout rendering -- Piotr Markowski, who gave me links to thousands of documents written in Polish, which helped me to considerably enhance my character maps -- And much more... - -Although I did not solved all the issues yet, I would like to thank you all for your contributions and your help ! - +# INTRODUCTION # + +I wanted to warmly thank a whole bunch of people here that helped me to enhance my **PdfToText** class. Of course, there is still a lot of work to do, but without their help, I could not have achieved anything reliable. 
+ +# INSPIRATIONS # + +My first thanks go to the following people : + +- The people at Adobe who wrote the Pdf File format reference (I used version 1.7 of this reference, available here : [http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf](http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf "http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf")). As for all specifications and standards, it leaves room for ambiguity, but this is a high quality document that everyone concerned with PDF issues should read first (well, as long as you can spend enough time to walk through the 1300 pages that this document contains...) +- The phpclasses.org site, which allowed me to publish this class and provided me with a medium to help new users of my class to solve issues +- The "*unknown developer*" ; when I was first asked at work to extract text from pdf files, I was provided with the code referenced here : [contributions/pdftotext.php](contributions/pdftotext.php "contributions/pdftotext.php"). I don't know who this developer is (the source code did not contain any name), but I would like to really thank him, because his work rapidly gave me some knowledge of the PDF file format. Although I developed my class my own way, I borrowed from him the **decodeAsciiHex** and **decodeAscii85** functions. +- I also would like to thank Adeel Ahmad Khan whose work gave me further understanding of the Pdf file format ([https://github.com/adeel/php-pdf-parser](https://github.com/adeel/php-pdf-parser "https://github.com/adeel/php-pdf-parser")). +- The author of the TCPDF package + +# USERS OF PDFTOTEXT # + +When I first published the **PdfToText** class on the *phpclasses.org* site, I already knew that it was not able to handle all the possible situations.
The Pdf file format is so versatile that I could not get enough samples to check my class against them. + +This is why I clearly asked users to send me sample Pdf files whenever they encountered issues on them, to help me enhance my class ; and every user played the game, so I would like to thank the following people, in completely random order (and hoping that I did not miss anyone of them...) : + +- Pawel Lancucki and Blaine Hilton, who helped me to document more precisely on the fact that the class required a PHP version greater or equal to 5.5 +- Stephen Layton, who provided me with samples and actively tested my new versions +- Theodis Butler, who also provided me with samples and helped me solve some issues +- Rafael Rojas Torres, who gave me the idea of handling password-protected pdf files. Although not yet implemented, this definitely is on my roadmap. +- Rolf Kellner , who had issues with unicode translations on far-east and middle-east languages. I have not yet solved them, but I'm still working on it ! +- Yuri Kadeev, who had issues with data presented in tabular format. I won't be able to solve all of them, but it helped me a lot to solve issues presented by other users. +- Steve Majors, from CashFlowProducts.com, who gave me several samples to work on and even provided me with an access to his bug-tracking system. He also gave me the idea to implement PDF form data extraction. +- Antonio Jùnior, who gave me a sample using text images encoded in a format I did not handle yet ; still under work... +- Tom Perro and the user named *srizoophari*, because they gave me the idea of implementing some features which allow for searching text page by page (and handle page contents separately, instead of a single block of text) +- Shishant Todi, who gave me pdf samples built in a very strange way. I'm still working on them ! 
+- Menny Grossmann, for yet another way to write pdf files in tabular format +- Carin Pretorius, who sent me samples with the biggest character maps I've ever seen +- Rolf Mast, who sent me samples that (finally) did not use character maps and allowed me to solve more easily a few bugs in the way I was parsing text drawing instructions +- Rajnish Tatiwala who, among other things, sent me my first sample containing objects encoded with the CCITT FAX standard. +- Jocemar Varela, who sent me samples that were generated using OCR software +- Aryan Schmitz, who tried to port the PdfToText class to a system running PHP 5.2. This successful attempt led to long and detailed support exchanges, where I really appreciated his proactivity. +- Francisco Godoy, who sent me a PDF sample which was simple enough to help me solve a long-time issue I had with PDF files using inline "templates" +- Patryk, from expromo.pl, who helped me to handle font aliases that are local to a page by providing me with samples easy enough to investigate ; the same samples also helped me + to start an experimental support of CID fonts. +- DL, from aloha-intbiz.eu, who suggested that I give the option of saving images on-the-fly without storing them into memory, and who gave me samples that helped me to process more cases during image extraction. +- Kis Balazs, who gave me a sample which helped me understand why sometimes certain accented characters were incorrectly translated. +- Massimo Baglione, who sent me samples covering various artistic domains, and helped me identify and correct +- Aldo Mariussi, who supplied me with a 1-page sample that helped me enhance the way 2-byte character sequences are decoded when specified between parentheses +- Jens Kirk and Roshayne Jaimon, who sent me samples containing compound objects, which incited me to entirely review the way I was handling them. As a side-effect, this new way of handling PDF objects miraculously solved 3 other bugs !
+- Youen Toupin, with a sample containing embedded binary images in regular PDF object streams, which is a feature I was not aware of +- Javier Diez, who submitted a sample created by Quark XPress and helped me solve basic problems not seen so far +- Thomas Bourgeois, who sent me a very comprehensive set of small PDF samples showing issues, some of which had existed for a long time +- Rob Webster, who sent me my first sample using the LZW compression algorithm +- Darren Jett for his support in helping me to start implementing page layout rendering +- Manuel Osuna, who gave me the idea of defining text capture areas inside a PDF file +- Luis Manuel Reyes, who submitted samples that helped me improve text layout rendering +- Piotr Markowski, who gave me links to thousands of documents written in Polish, which helped me to considerably enhance my character maps +- And many more... + +Although I have not solved all the issues yet, I would like to thank you all for your contributions and your help ! + diff --git a/FormTemplates/US-IRS-W9.xml b/FormTemplates/US-IRS-W9.xml index 5288d27..dad16a0 100755 --- a/FormTemplates/US-IRS-W9.xml +++ b/FormTemplates/US-IRS-W9.xml @@ -1,39 +1,39 @@ - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
\ No newline at end of file diff --git a/HISTORY.txt b/HISTORY.txt index 1e4eeb6..a50161f 100755 --- a/HISTORY.txt +++ b/HISTORY.txt @@ -1,926 +1,926 @@ - [Version : 1.6.6] [Date : 2017/05/22] [Author : CV] - . Completely rebuilt the page layout rendering algorithm - . Some character widths were not correctly extracted because of line breaks in the widths list. - . Fixed an issue where a character map was sometimes instantiated with the wrong parameter. - . Correctly handle character widths for characters defined by CharProcs (ie, for which the only - information we have is how to draw the glyph, but no Unicode equivalent) and the corresponding - character names that may have been passed to the AddAdobeExtraMappings() method. - . Properly decode sequences of hex digits when there is no current font applicable. - . Completed the Unicode to Ansi character map - - [Version : 1.6.5] [Date : 2017/05/20] [Author : CV] - . Complemented the Unicode to Ansi mapping table. - . Added the AddAdobeExtraMappings() method, to complement the standard Adobe character maps when given - character names refer to a glyph that has no Unicode equivalent. - . Added the MarkTextLike() method to mark certain portions of text based on their font name and size. - . Changed the GetCaptures() method to return by default a collection of stdClass objects instead of - PdfToText objects whose contents takes time to be displayed when using the print_r() function. - The new boolean parameter $full allows to return PdfToText objects instead when set to true. - - [Version : 1.6.3] [Date : 2017/05/17] [Author : CV] - . Changed the $CharacterClasses table, which was causing some constructs, such as T*, not to be - recognized as a single instruction. - . Fixed a decoding bug when a series of hex digits enclosed in '<>' also contained spaces and newlines - (which should have been ignored). - . Allow names in the /Differences array to use the '#xy' notation, where 'xy' are hex digits. - . 
Significantly complemented character maps for the four Adobe predefined character sets. - . Text captures : changed the behavior of the definitions ; now, captured areas are - accessible by their page number, instead of a sequential index starting from zero. A capture is - defined for each page of the document, even if not in the list of applicable pages. Of course, empty - captures will be contained in the list if nothing was captured on the corresponding page. - - [Version : 1.6.2] [Date : 2017/05/17] [Author : CV] - . The adobe-charsets.map file was searched in the wrong directory. - - [Version : 1.6.1] [Date : 2017/05/12] [Author : CV] - . Text captures : - - Fixed a mistake where columns having empty values were not present in the object returned by the - GetCaptures() method. - . Complemented the table that maps Unicode special characters such as spaces, quotes, hyphens, etc. to - their Ascii counterpart (mainly for quotes) - . Enforced the ability of the class to work in 'degraded mode' whem some external tables are missing. - . Complemented the four Adobe standard character sets with more entity names, such as Polish characters, - Greek characters and special symbols (hundreds of symbols added). - . Moved the adobe-charsets.map file to the new Maps directory - . Created a Maps/unicode-to-ascii.map mapping file. - - [Version : 1.6.0] [Date : 2017/05/08] [Author : CV] - . Added the possibility to capture areas of text : - - The SetCaptures() and SetCapturesFromString() methods define the pages and areas of text within the - pages to be captured (see file README.md for more information). It can be used to define rectangle - shapes or line/columns information - - The GetCaptures() method returns an object containing the captured text areas - - Added the PDFOPT_LOOSE_X_CAPTURE and PDFOPT_LOOSE_Y_CAPTURE to include text that might exceed the - captured area, but whose top/left coordinates are included in the captured area. - . 
Complemented the Adobe 4 standard character set encodings, to include Polish characters. - . Exported the Adobe standard character sets to external file 'adobe-charsets.map'. - - [Version : 1.5.8] [Date : 2017/05/01] [Author : CV] - . Added undocumented aliases for stream encoding (for example, /Fl stands for /FlateDecode). - - [Version : 1.5.7] [Date : 2017/04/26] [Author : CV] - . Added the possibility to extract form data : - - Added the HasFormData(), GetFormCount() and GetFormData() methods - - Data extraction based on XML form templates, which maps form fields to human-readable ones - - Form data is returned as an object inherited from the PdfToTextFormData class - . Added the PDFOPT_DEBUG_SHOW_COORDINATES option, which shows coordinates of every text block in the - output text. This option has been designed for the future feature to be implemented, that will allow - to capture text areas. - . Added the Subject and Keywords properties regarding author information - . Changed the text/document_strxpos() methods to use the mb_strxpos() functions instead of strxpos(). - The supplied searched string must be encoded in UTF-8. - . Added the PdfToTextBase::GetStringParameter() method, which is able to retrieve parameter values such - as : - /FlagName (parameter value) - and : - /FlagName - - [Version : 1.5.6] [Date : 2017/04/21] [Author : CV] - . Added font metrics information for the Adobe Standard 14 fonts, which are hardcoded (in new directory - FontMetrics). This includes Times, Helvetica and Courier with their variations (bold, italic, etc.) - along with the Symbol and ZapDingbats fonts. Currently, font information relates to individual - character widths, which are used for page layout rendering. - . Enhanced layout rendering, which was giving strange results due to improper handling of certain - positioning instructions. - . Complemented the $UnicodeToSimpleAscii table to include special hyphens - - [Version : 1.5.5] [Date : 2017/04/20] [Author : CV] - . 
Added the $UnicodeToSimpleAscii table, which maps Unicode characters which can have an ASCII - equivalent. For example, German "fi" with ligature (U+FB01) becomes ASCII string "fi" ; special - spaces (such as unbreakable space) become an ASCII space (0x20), etc. - . Fixed a warning issued when a page entry does not contain a width and a height. - . Suppressed a warning in non-debug mode when an unsupported encryption algorithm has been found - - [Version : 1.5.4] [Date : 2017/04/07] [Author : CV] - . Fixed an issue with the /Kids flag, whose parameters are normally the ids of the objects containing - a page's contents. Sometimes, there is an additional indirection level : the parameter of the /Kids - flag is an obect which in turn contains the ids of page contents objects. Not handling this - situation caused some page contents to be missed. - . Some line breaks were not respected - - [Version : 1.5.3] [Date : 2017/04/06] [Author : CV] - . Enhanced page layout rendering : - - Better handle text positioning, especially when a line contains super/subscripted text - - Implemented additional text positioning instructions - - First experimental implementation of templates (which are correctly implemented in the non-page - layout version) - - Fixed a regression intoduced in versions 1.5.1 and 1.5.2. - . Completed some missing codes for the WinAnsiEncoding font, which is far bigger than stated in the - PDF specifications. - . Modified the default value of the ExtraTextWidth property to -5%, since most of the widths - computed by the GetStringWidth() method are a little bit too large. - - [Version : 1.5.2] [Date : 2017/04/05] [Author : CV] - . Page layout rendering enhancements : - - Interpret more instructions - - Fixed a problem where two lines, the first one having the biggest font, were joined together - . Fixed a bug in the Unescape() method which allowed octal sequences of more than 3 characters to be - interpreted, giving an incorrect character code as output. - . 
Some drawing objects were not recognized in the input ; this could affect page layout rendering. - . The document_strpos() methods were not returning the correct page number, but a zero-based offset - (the Pages property is indexed by the actual page number) - - [Version : 1.5.1] [Date : 2017/03/31] [Author : CV] - . Corrected a bug in the LZW decompression algorithm. - - [Version : 1.5.0] [Date : 2017/03/31] [Author : CV] - . Implemented page layout rendering !!! - - Text parts are displayed in the same order as Acrobat Reader displays them - - Spaces are inserted in the same line when necessary (the BlockSeparator property can be used - to separate items that are on the same line, at different -x-coordinates) - - The PDFOPT_RAW_LAYOUT and PDFOPT_BASIC_LAYOUT options have been added. The default is - PDFOPT_RAW_LAYOUT, which behaves as in the previous versions. The PDFOPT_BASIC_LAYOUT will - activate layout rendering. - - The new ExtraTextWidth property allows to adjust the computed text widths to help determine if - two consecutive blocks of text on the same line should be separated by a whitespace or not. - - Hold a separate set of instructions to be removed from PDF stream before interpretation, - depending on whether the page layout option has been activated or not. This is done not to impact - the performance of this class for callers that use it the traditional way. - This new feature allows to process more efficiently PDF files presenting tabular data or form data - (such as tickets reservations for example). Note that the PDFOPT_BASIC_LAYOUT option simply ensures - that items coming from the PDF file are shown in the correct order and not improperly concatenated ; - it does not visually reproduce what you could see with Acrobat Reader. - . Font descriptors were mistakenly interpreted as font entries. - - [Version : 1.4.19] [Date : 2017/03/25] [Author : CV] - . Added the PDFOPT_ENHANCED_STATISTICS flag (for debugging/optimization purposes) - . 
Added more useless instructions to be removed from the PDF input stream before processing - . Fixed a warning issued when processing certain objects not having stream data - . Fixed a warning issued when a character map does not contain a begincodespacerange/endcodespacerange - construct. - . Fixed a warning issued by the MapKids() method when a page catalog refers to a non-existing object. - . Re-established PCRE error handling during PDF file processing, since the pcre.backtrack_limit clearly - imposes a limit on the size of the data that can be captured (it does not only depends on the - complexity of the regular expression). This means that PDF file scanning will be stopping with an - exception if the size of the object to be captured exceeds such a limit. - . Fixed inappropriate warnings about undefined fonts - - [Version : 1.4.18] [Date : 2017/03/25] [Author : CV] - . Fixed a warning issued when author information in buggy PDFs referred to non-existing objects. - . The PageSeparator property was not taken into account by the GetPageFromOffset() method. - . Fixed a regression which caused an exception to be thrown if the PDF document did not exactly start - with '%PDF' - . Remove more useless instruction from the input stream before processing it (graphic-related - instructions). - . Trying to process object streams that contain invalid gzip data led to an infinite loop. - . Handle buggy PDF containing object streams which do not start with an even number of integer values - (this should normally be a list of object number/offset pairs) - . The CodePointToUtf8() function was running into an inifinite loop when the high order bit of the - supplied value was set (unsigned right-shift operator does not exist in PHP 5.*). - . Handle another kind of buggy PDF that have a page catalog referring to a non-existing object ; in this - case the behavior is the same as if there is no page catalog at all : everything is grouped onto a - single page. 
- - [Version : 1.4.17] [Date : 2017/03/21] [Author : CV] - . Drawing instructions between BX/EX were unduly removed, causing some text to be missing sometimes in - the output. - . Fixed an inappropriate property setting in class PdfTexterTimeoutException - . Handle the case where /XObjects contents are not inline but specify another object with the inline - contents. This caused some text to be missed in the output. - . When a Unicode font had a secondary character map (representing the /Differences array), the - secondary cmap was not searched if the character to be mapped was not also defined in the primary - cmap. This caused some mappings to be missed. - - [Version : 1.4.16] [Date : 2017/03/19] [Author : CV] - . Completely reviewed the way PDF objects are parsed ; the original code sometimes forced the user to - change the pcre.backtrack_limit PHP setting to abnormal values (14 000 000 for parsing the Adobe - PDF Specifications document itself !) - . Implemented the LZW decompression algorithm for text objects. - - [Version : 1.4.15] [Date : 2017/03/17] [Author : CV] - . Added the PDFOPT_IGNORE_HEADERS_AND_FOOTERS option. The previous behavior was to ignore them - systematically. - . Changed the PdfTexterFont object to handle basic encodings - . Refactored class for decryption support : - - Moved some functions to the PfTexterObjectBase object - - Created the PdfEncryptionData class to hold encryption data defined in the PDF file, by - transfering all the encryption-related properties from the PdfToText class to PdfEncryptionData - - Added the EncryptionData property, which currently has the "protected" visibility - - Renamed the IsPasswordProtected property to IsEncrypted - Note : processing of encrypted files is not yet functional - - [Version : 1.4.14] [Date : 2017/03/14] [Author : CV] - . Fixed a regression in text decoding when compound text contained angle brackets and right square - brackets (originally fixed in 1.4.12, but regressed in 1.4.13). 
- - [Version : 1.4.13] [Date : 2017/03/13] [Author : CV] - . Updated IDENTITY-H CID font mapping to add more characters. - . Handle the case where a character map states that it handles 2-bytes character codes (using the - begincoderange/endcoderange constructs) while the map itself only contains 1-byte character codes... - . Changed character maps that are secondary to a Unicode map to handle only the characters listed in - the /Differences parameter. - . Updated the offical Adobe WinAnsi character map to include UTF8 codes which have no Windows - equivalent (PdfTexterEncodingMap::$Encodings array). - - [Version : 1.4.12] [Date : 2017/03/12] [Author : CV] - . Added the LoadFromString() method, which allows to process PDF contents directly from a string. - . Changed the Load() method to be able to handle remote urls. - . Fixed a bug in the __next_token() method which caused contents following a regular square bracket - character to be wrongly interpreted. - . Integrated a modification resolving the interpretation of escaped octal sequences, that was missed - in version 1.4.11. - . Fixed : The new PdfToTextDecodingException class did not report the supplied error message. - - [Version : 1.4.11] [Date : 2017/03/11] [Author : CV] - . Corrected decoding of Ascii85 data (the original version from the 'unknown developer' did consider - that the '%' character was the start of a comment, which is not the case in Ascii85 encoded data) - . Handle the case where data using Ascii85 encoding contains in turn gzipped data - . Optimized the __decode_ascii85() method - . Characters using octal escape sequences were not correctly interpreted when followed by digits after - the 3rd one of the escape sequence. This so&metimes caused incorrect character decoding. - . 
Added timeout handling features ; when the PDFOPT_ENFORCE_EXECUTION_TIME or - PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME are specified, the $MaxExecutionTime property and - $MaxGlobalExecutionTime static property (respectively) will be used to prevent exceeding the PHP setting - 'max_execution_time'. If such a situation happens, a PdfToTextTimeoutException exception will be - thrown. - . Added the $MaxExtractedImages property, to limit the number of extracted images. - - [Version : 1.4.10] [Date : 2017/03/09] [Author : CV] - . Fixed an issue when decoding object streams : a lack of whitespace after the list of object number/ - offset pairs implied a shift of one character in object data extraction, making most of the objects - contained in the object stream to be missed. - - [Version : 1.4.9] [Date : 2017/03/09] [Author : CV] - . Handle the case where character mappings specified through the /Differences keyword can contain - constructs such as '/uniabcd', where 'abcd' is a sequence of hex digits representing a Unicode code - point. - . Found rare cases where a Unicode map mapped character IDs to a value between 0xF000 and 0xF0FF, which - do not correspond to any Unicode codepoint, but rather to an undocumented font. The class - PdfTexterAdobeUndocumentedFont has been designed to handle such mappings. - - [Version : 1.4.8] [Date : 2017/03/08] [Author : CV] - . The new PdfTexterAdobeMap classes did not specify the width of character codes, which caused bad - interpretation of characters expressed as octal escaped sequences. - . Fixed a regression, where the numeric value of octal escape sequences was displayed as is - . Fixed a warning issued when an empty date is specified in Author information - . Fixed a warning issued when instantianting objects of class PdfTexterFont with a missing 4th argument. - . Object streams (compound objects) are now discarded from the object list after processing their - embedded objects - . 
Fixed bad interpretation of font sizes less than 1 with no leading zero (eg, '.93') - . Better process text contents using characters escaped in octal notation - - [Version : 1.4.7] [Date : 2017/03/05] [Author : CV] - . First (and partial) implementation of CID fonts for the eastern Europe languages (currently tested on - Polish). - . Fixed a warning about an undefined $map variable in class PdfTexterIdentityHCIDFont. - - [Version : 1.4.6] [Date : 2017/03/04] [Author : CV] - . Fixed errors regarding undefined PdfTexterFont::$WinAnsiCharacterMap and $MacRomanCharacter map, which - had been moved to the PdfTexterAdobeWinAnsiMap and PdfTexterAdobeMacRomanMap classes, respectively. - - [Version : 1.4.5] [Date : 2017/03/05] [Author : CV] - . Implemented Cyrillic fonts (ISO-8859-5). - . Created the PdfTexterAdobe*Map classes, to implement the WinAnsi and Mac Roman encodings, instead of - implementing them as tables at the PdfTexterFont class level. - - [Version : 1.4.4] [Date : 2017/03/04] [Author : CV] - . Fixed several issues that affected text output : - - Completely changed the way "beginbfrange..endbfrange" constructs are handled ; some interpretation - problems occurred when line breaks were present at unexpected places, giving incorrect character - mappings. - - Unified the way escaped characters are handled in a text stream ; sometimes, the escaped character - appeared as is in the output, instead of being mapped. - - Handle more character escapes such as backspace, which has no equivalent in PHP. - . Handle the case where author information contains parentheses, which are also used as delimiters - - [Version : 1.4.3] [Date : 2017/03/03] [Author : CV] - . Handle the case where Unicode fonts can also have an associated /Encoding object, specifying a - /Differences flag that maps character ids to other character ids. - . 
The checkings performed on special /Contents references which reference an object holding the list - of the objects contained in the page was a little bit too relaxed and, in some rare cases, caused - a few text blocks to be missing in the output (this feature was introduced in version 1.4.1). - . Corrected a warning in method GetMappedFonts() - . For better performance, remove all useless instructions at the start and end of each drawing - instruction block. - . Started expanding the PdfTexterEncodingFont::$Encodings table to add character mappings with - character names not described in the PDF specifications. - . Better handle recognition of author information (sometimes, the hex2bin() function issued a warning) - - [Version : 1.4.2] [Date : 2017/03/02] [Author : CV] - . Fixed mapping issue when the same character map was referenced by several font definitions (only the - first definition was associated to the character map, which caused incorrect character mappings when - subsequent fonts were used). - . The regular expression catching author information was a little bit too greedy, causing some - misinterpretations and warnings in the output. - - [Version : 1.4.1] [Date : 2017/02/27] [Author : CV] - . Enhanced page contents decoding (sometimes, the parameter of the /Contents flags does not reference - objects containing drawing instructions, but objects containing the list of objects which in turn - contain the real drawing instructions). This caused in rare cases some contents to be missed in the - output. - . Changed the way compound objects are handled : instead of decoding them on-the-fly, preprocess them - before any other processing takes place. This ensure that forward references to objects defined later - in the pdf file will be satisfied. - . Ignore embedded images inside drawing instructions blocks ; the presence of gzipped data inside the - embedded image could cause misinterpretation of drawing instructions following it. 
For the current - version, embedded images will not be integrated into the image-extraction process. - . Fixed issue where the same group of text was extracted several times with some PDF samples. - . Better handle Japanese documents - - [Version : 1.4.0] [Date : 26/02/2017] [Author : CV] - . First implementation for handling languages written from right-to-left (RTL). - - [Version : 1.3.18] [Date : 25/02/2017] [Author : CV] - . Corrected the PdfTexterUnicodeMap class, which did not correctly decode character maps in files - generated on Apple where line-endings are carriage returns. As a result, output looked like garbage - data. - - [Version : 1.3.17] [Date : 25/02/2017] [Author : CV] - . Completely rewrote the way text specified between parentheses, either as plain text or 2-bytes - character codes, is processed. Some character values preceded with a backslash are escape sequences, - which were not recognized in all cases, thus causing a shift when interpreting 2-bytes values and a - bad mapping for the 2-bytes sequences that followed. - - [Version : 1.3.16] [Date : 2017/02/23] [Author : CV] - . Changed the GetFontByMapId() method which was incorrectly searching a global font instead of searching - first for a page-specific font. - . Compound objects were not correctly handled when object number/offset pairs were separated by a newline - (str_replace() was called instead of preg_replace). This caused some objects to be missed. - . Performed some optimizations which allow for a performance gain of 5 to 10 percent in certain cases. - . Removed the suppression of carriage returns, as this is the only line separator used by Adobe software - on Apple. - - [Version : 1.3.15] [Date : 2017/02/12] [Author : CV] - . Handles author information whose keyword values refer to existing object contents instead of - referring to a direct value - . 
Handles new constructions for beginbfchar/endbfchar constructs, which can act as beginbfrange ; - Example : - <21> <0009 0020 000d> - means : - . Map character #21 to #0009 - . Map character #22 to #0020 - . Map character #23 to #000D - There is no clue in the Adobe PDF specification that a single character could be mapped to a range. - The normal constructs would be : - <21> <0009> - <22> <0020> - <23> <0000D> - . Regular expressions matching Postscript instructions to be removed before interpreting the remaining - contents were sometimes catenating one instruction with the first parameter of the following one, - which caused bad interpretation, some warnings and bad handling of the layout (some lines could be - catenated together). - . Changed the MinSpaceWidth value from 250 to 200 (certain files separate words with lower spacing - values) - - [Version : 1.3.14] [Date : 2017/02/07] [Author : CV] - . Fixed the *_strpos methods which did not return correct page information any more - . Added new font aliases possibilities (/0 through /9 and /a through /z) - - [Version : 1.3.13] [Date : 2017/02/05] [Author : CV] - . Added the MaxSelectedPages property to extract only the first or last x pages of the document. - . Pure JPEG images are no more loaded into memory (using the gd library) when the - PDFOPT_AUTOSAVE_IMAGES flag is specified. - - [Version : 1.3.12] [Date : 2017/02/01] [Author : CV] - . Enhanced image extraction by adding support for more image formats, notably those having the - /FlateDecode flag. The new supported image formats are : - - Standard JPEG data, not initially specified as a real image - - Image data encoded as : - . RGB color values - . CMYK color values - . Gray scale color values - Currently, only 8-bits color components are supported. - . Added the PdfInlinedImage class, and changed the AddImage() and DecodeImage() methods to handle - these new image processing enhancements. + [Version : 1.6.6] [Date : 2017/05/22] [Author : CV] + . 
Completely rebuilt the page layout rendering algorithm + . Some character widths were not correctly extracted because of line breaks in the widths list. + . Fixed an issue where a character map was sometimes instantiated with the wrong parameter. + . Correctly handle character widths for characters defined by CharProcs (ie, for which the only + information we have is how to draw the glyph, but no Unicode equivalent) and the corresponding + character names that may have been passed to the AddAdobeExtraMappings() method. + . Properly decode sequences of hex digits when there is no current font applicable. + . Completed the Unicode to Ansi character map + + [Version : 1.6.5] [Date : 2017/05/20] [Author : CV] + . Complemented the Unicode to Ansi mapping table. + . Added the AddAdobeExtraMappings() method, to complement the standard Adobe character maps when given + character names refer to a glyph that has no Unicode equivalent. + . Added the MarkTextLike() method to mark certain portions of text based on their font name and size. + . Changed the GetCaptures() method to return by default a collection of stdClass objects instead of + PdfToText objects whose contents takes time to be displayed when using the print_r() function. + The new boolean parameter $full allows to return PdfToText objects instead when set to true. + + [Version : 1.6.3] [Date : 2017/05/17] [Author : CV] + . Changed the $CharacterClasses table, which was causing some constructs, such as T*, not to be + recognized as a single instruction. + . Fixed a decoding bug when a series of hex digits enclosed in '<>' also contained spaces and newlines + (which should have been ignored). + . Allow names in the /Differences array to use the '#xy' notation, where 'xy' are hex digits. + . Significantly complemented character maps for the four Adobe predefined character sets. + . 
Text captures : changed the behavior of the definitions ; now, captured areas are + accessible by their page number, instead of a sequential index starting from zero. A capture is + defined for each page of the document, even if not in the list of applicable pages. Of course, empty + captures will be contained in the list if nothing was captured on the corresponding page. + + [Version : 1.6.2] [Date : 2017/05/17] [Author : CV] + . The adobe-charsets.map file was searched in the wrong directory. + + [Version : 1.6.1] [Date : 2017/05/12] [Author : CV] + . Text captures : + - Fixed a mistake where columns having empty values were not present in the object returned by the + GetCaptures() method. + . Complemented the table that maps Unicode special characters such as spaces, quotes, hyphens, etc. to + their Ascii counterpart (mainly for quotes) + . Enforced the ability of the class to work in 'degraded mode' when some external tables are missing. + . Complemented the four Adobe standard character sets with more entity names, such as Polish characters, + Greek characters and special symbols (hundreds of symbols added). + . Moved the adobe-charsets.map file to the new Maps directory + . Created a Maps/unicode-to-ascii.map mapping file. + + [Version : 1.6.0] [Date : 2017/05/08] [Author : CV] + . Added the possibility to capture areas of text : + - The SetCaptures() and SetCapturesFromString() methods define the pages and areas of text within the + pages to be captured (see file README.md for more information). It can be used to define rectangle + shapes or line/columns information + - The GetCaptures() method returns an object containing the captured text areas + - Added the PDFOPT_LOOSE_X_CAPTURE and PDFOPT_LOOSE_Y_CAPTURE to include text that might exceed the + captured area, but whose top/left coordinates are included in the captured area. + . Complemented the Adobe 4 standard character set encodings, to include Polish characters. + . 
Exported the Adobe standard character sets to external file 'adobe-charsets.map'. + + [Version : 1.5.8] [Date : 2017/05/01] [Author : CV] + . Added undocumented aliases for stream encoding (for example, /Fl stands for /FlateDecode). + + [Version : 1.5.7] [Date : 2017/04/26] [Author : CV] + . Added the possibility to extract form data : + - Added the HasFormData(), GetFormCount() and GetFormData() methods + - Data extraction based on XML form templates, which maps form fields to human-readable ones + - Form data is returned as an object inherited from the PdfToTextFormData class + . Added the PDFOPT_DEBUG_SHOW_COORDINATES option, which shows coordinates of every text block in the + output text. This option has been designed for the future feature to be implemented, that will allow + to capture text areas. + . Added the Subject and Keywords properties regarding author information + . Changed the text/document_strxpos() methods to use the mb_strxpos() functions instead of strxpos(). + The supplied searched string must be encoded in UTF-8. + . Added the PdfToTextBase::GetStringParameter() method, which is able to retrieve parameter values such + as : + /FlagName (parameter value) + and : + /FlagName + + [Version : 1.5.6] [Date : 2017/04/21] [Author : CV] + . Added font metrics information for the Adobe Standard 14 fonts, which are hardcoded (in new directory + FontMetrics). This includes Times, Helvetica and Courier with their variations (bold, italic, etc.) + along with the Symbol and ZapDingbats fonts. Currently, font information relates to individual + character widths, which are used for page layout rendering. + . Enhanced layout rendering, which was giving strange results due to improper handling of certain + positioning instructions. + . Complemented the $UnicodeToSimpleAscii table to include special hyphens + + [Version : 1.5.5] [Date : 2017/04/20] [Author : CV] + . 
Added the $UnicodeToSimpleAscii table, which maps Unicode characters which can have an ASCII + equivalent. For example, German "fi" with ligature (U+FB01) becomes ASCII string "fi" ; special + spaces (such as unbreakable space) become an ASCII space (0x20), etc. + . Fixed a warning issued when a page entry does not contain a width and a height. + . Suppressed a warning in non-debug mode when an unsupported encryption algorithm has been found + + [Version : 1.5.4] [Date : 2017/04/07] [Author : CV] + . Fixed an issue with the /Kids flag, whose parameters are normally the ids of the objects containing + a page's contents. Sometimes, there is an additional indirection level : the parameter of the /Kids + flag is an object which in turn contains the ids of page contents objects. Not handling this + situation caused some page contents to be missed. + . Some line breaks were not respected + + [Version : 1.5.3] [Date : 2017/04/06] [Author : CV] + . Enhanced page layout rendering : + - Better handle text positioning, especially when a line contains super/subscripted text + - Implemented additional text positioning instructions + - First experimental implementation of templates (which are correctly implemented in the non-page + layout version) + - Fixed a regression introduced in versions 1.5.1 and 1.5.2. + . Completed some missing codes for the WinAnsiEncoding font, which is far bigger than stated in the + PDF specifications. + . Modified the default value of the ExtraTextWidth property to -5%, since most of the widths + computed by the GetStringWidth() method are a little bit too large. + + [Version : 1.5.2] [Date : 2017/04/05] [Author : CV] + . Page layout rendering enhancements : + - Interpret more instructions + - Fixed a problem where two lines, the first one having the biggest font, were joined together + . Fixed a bug in the Unescape() method which allowed octal sequences of more than 3 characters to be + interpreted, giving an incorrect character code as output. + . 
Some drawing objects were not recognized in the input ; this could affect page layout rendering. + . The document_strpos() methods were not returning the correct page number, but a zero-based offset + (the Pages property is indexed by the actual page number) + + [Version : 1.5.1] [Date : 2017/03/31] [Author : CV] + . Corrected a bug in the LZW decompression algorithm. + + [Version : 1.5.0] [Date : 2017/03/31] [Author : CV] + . Implemented page layout rendering !!! + - Text parts are displayed in the same order as Acrobat Reader displays them + - Spaces are inserted in the same line when necessary (the BlockSeparator property can be used + to separate items that are on the same line, at different -x-coordinates) + - The PDFOPT_RAW_LAYOUT and PDFOPT_BASIC_LAYOUT options have been added. The default is + PDFOPT_RAW_LAYOUT, which behaves as in the previous versions. The PDFOPT_BASIC_LAYOUT will + activate layout rendering. + - The new ExtraTextWidth property allows to adjust the computed text widths to help determine if + two consecutive blocks of text on the same line should be separated by a whitespace or not. + - Hold a separate set of instructions to be removed from PDF stream before interpretation, + depending on whether the page layout option has been activated or not. This is done not to impact + the performance of this class for callers that use it the traditional way. + This new feature allows to process more efficiently PDF files presenting tabular data or form data + (such as tickets reservations for example). Note that the PDFOPT_BASIC_LAYOUT option simply ensures + that items coming from the PDF file are shown in the correct order and not improperly concatenated ; + it does not visually reproduce what you could see with Acrobat Reader. + . Font descriptors were mistakenly interpreted as font entries. + + [Version : 1.4.19] [Date : 2017/03/25] [Author : CV] + . Added the PDFOPT_ENHANCED_STATISTICS flag (for debugging/optimization purposes) + . 
Added more useless instructions to be removed from the PDF input stream before processing + . Fixed a warning issued when processing certain objects not having stream data + . Fixed a warning issued when a character map does not contain a begincodespacerange/endcodespacerange + construct. + . Fixed a warning issued by the MapKids() method when a page catalog refers to a non-existing object. + . Re-established PCRE error handling during PDF file processing, since the pcre.backtrack_limit clearly + imposes a limit on the size of the data that can be captured (it does not only depend on the + complexity of the regular expression). This means that PDF file scanning will stop with an + exception if the size of the object to be captured exceeds such a limit. + . Fixed inappropriate warnings about undefined fonts + + [Version : 1.4.18] [Date : 2017/03/25] [Author : CV] + . Fixed a warning issued when author information in buggy PDFs referred to non-existing objects. + . The PageSeparator property was not taken into account by the GetPageFromOffset() method. + . Fixed a regression which caused an exception to be thrown if the PDF document did not exactly start + with '%PDF' + . Removed more useless instructions from the input stream before processing it (graphic-related + instructions). + . Trying to process object streams that contain invalid gzip data led to an infinite loop. + . Handle buggy PDF containing object streams which do not start with an even number of integer values + (this should normally be a list of object number/offset pairs) + . The CodePointToUtf8() function was running into an infinite loop when the high order bit of the + supplied value was set (unsigned right-shift operator does not exist in PHP 5.*). + . Handle another kind of buggy PDF that have a page catalog referring to a non-existing object ; in this + case the behavior is the same as if there is no page catalog at all : everything is grouped onto a + single page. 
+ + [Version : 1.4.17] [Date : 2017/03/21] [Author : CV] + . Drawing instructions between BX/EX were unduly removed, causing some text to be missing sometimes in + the output. + . Fixed an inappropriate property setting in class PdfTexterTimeoutException + . Handle the case where /XObjects contents are not inline but specify another object with the inline + contents. This caused some text to be missed in the output. + . When a Unicode font had a secondary character map (representing the /Differences array), the + secondary cmap was not searched if the character to be mapped was not also defined in the primary + cmap. This caused some mappings to be missed. + + [Version : 1.4.16] [Date : 2017/03/19] [Author : CV] + . Completely reviewed the way PDF objects are parsed ; the original code sometimes forced the user to + change the pcre.backtrack_limit PHP setting to abnormal values (14 000 000 for parsing the Adobe + PDF Specifications document itself !) + . Implemented the LZW decompression algorithm for text objects. + + [Version : 1.4.15] [Date : 2017/03/17] [Author : CV] + . Added the PDFOPT_IGNORE_HEADERS_AND_FOOTERS option. The previous behavior was to ignore them + systematically. + . Changed the PdfTexterFont object to handle basic encodings + . Refactored class for decryption support : + - Moved some functions to the PfTexterObjectBase object + - Created the PdfEncryptionData class to hold encryption data defined in the PDF file, by + transferring all the encryption-related properties from the PdfToText class to PdfEncryptionData + - Added the EncryptionData property, which currently has the "protected" visibility + - Renamed the IsPasswordProtected property to IsEncrypted + Note : processing of encrypted files is not yet functional + + [Version : 1.4.14] [Date : 2017/03/14] [Author : CV] + . Fixed a regression in text decoding when compound text contained angle brackets and right square + brackets (originally fixed in 1.4.12, but regressed in 1.4.13). 
+ + [Version : 1.4.13] [Date : 2017/03/13] [Author : CV] + . Updated IDENTITY-H CID font mapping to add more characters. + . Handle the case where a character map states that it handles 2-bytes character codes (using the + begincoderange/endcoderange constructs) while the map itself only contains 1-byte character codes... + . Changed character maps that are secondary to a Unicode map to handle only the characters listed in + the /Differences parameter. + . Updated the official Adobe WinAnsi character map to include UTF8 codes which have no Windows + equivalent (PdfTexterEncodingMap::$Encodings array). + + [Version : 1.4.12] [Date : 2017/03/12] [Author : CV] + . Added the LoadFromString() method, which allows to process PDF contents directly from a string. + . Changed the Load() method to be able to handle remote urls. + . Fixed a bug in the __next_token() method which caused contents following a regular square bracket + character to be wrongly interpreted. + . Integrated a modification resolving the interpretation of escaped octal sequences, that was missed + in version 1.4.11. + . Fixed : The new PdfToTextDecodingException class did not report the supplied error message. + + [Version : 1.4.11] [Date : 2017/03/11] [Author : CV] + . Corrected decoding of Ascii85 data (the original version from the 'unknown developer' did consider + that the '%' character was the start of a comment, which is not the case in Ascii85 encoded data) + . Handle the case where data using Ascii85 encoding contains in turn gzipped data + . Optimized the __decode_ascii85() method + . Characters using octal escape sequences were not correctly interpreted when followed by digits after + the 3rd one of the escape sequence. This sometimes caused incorrect character decoding. + . 
Added timeout handling features ; when the PDFOPT_ENFORCE_EXECUTION_TIME or + PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME are specified, the $MaxExecutionTime property and + $MaxGlobalExecutionTime static property (respectively) will be used to prevent exceeding the PHP setting + 'max_execution_time'. If such a situation happens, a PdfToTextTimeoutException exception will be + thrown. + . Added the $MaxExtractedImages property, to limit the number of extracted images. + + [Version : 1.4.10] [Date : 2017/03/09] [Author : CV] + . Fixed an issue when decoding object streams : a lack of whitespace after the list of object number/ + offset pairs implied a shift of one character in object data extraction, making most of the objects + contained in the object stream to be missed. + + [Version : 1.4.9] [Date : 2017/03/09] [Author : CV] + . Handle the case where character mappings specified through the /Differences keyword can contain + constructs such as '/uniabcd', where 'abcd' is a sequence of hex digits representing a Unicode code + point. + . Found rare cases where a Unicode map mapped character IDs to a value between 0xF000 and 0xF0FF, which + do not correspond to any Unicode codepoint, but rather to an undocumented font. The class + PdfTexterAdobeUndocumentedFont has been designed to handle such mappings. + + [Version : 1.4.8] [Date : 2017/03/08] [Author : CV] + . The new PdfTexterAdobeMap classes did not specify the width of character codes, which caused bad + interpretation of characters expressed as octal escaped sequences. + . Fixed a regression, where the numeric value of octal escape sequences was displayed as is + . Fixed a warning issued when an empty date is specified in Author information + . Fixed a warning issued when instantiating objects of class PdfTexterFont with a missing 4th argument. + . Object streams (compound objects) are now discarded from the object list after processing their + embedded objects + . 
Fixed bad interpretation of font sizes less than 1 with no leading zero (eg, '.93') + . Better process text contents using characters escaped in octal notation + + [Version : 1.4.7] [Date : 2017/03/05] [Author : CV] + . First (and partial) implementation of CID fonts for the eastern Europe languages (currently tested on + Polish). + . Fixed a warning about an undefined $map variable in class PdfTexterIdentityHCIDFont. + + [Version : 1.4.6] [Date : 2017/03/04] [Author : CV] + . Fixed errors regarding undefined PdfTexterFont::$WinAnsiCharacterMap and $MacRomanCharacter map, which + had been moved to the PdfTexterAdobeWinAnsiMap and PdfTexterAdobeMacRomanMap classes, respectively. + + [Version : 1.4.5] [Date : 2017/03/05] [Author : CV] + . Implemented Cyrillic fonts (ISO-8859-5). + . Created the PdfTexterAdobe*Map classes, to implement the WinAnsi and Mac Roman encodings, instead of + implementing them as tables at the PdfTexterFont class level. + + [Version : 1.4.4] [Date : 2017/03/04] [Author : CV] + . Fixed several issues that affected text output : + - Completely changed the way "beginbfrange..endbfrange" constructs are handled ; some interpretation + problems occurred when line breaks were present at unexpected places, giving incorrect character + mappings. + - Unified the way escaped characters are handled in a text stream ; sometimes, the escaped character + appeared as is in the output, instead of being mapped. + - Handle more character escapes such as backspace, which has no equivalent in PHP. + . Handle the case where author information contains parentheses, which are also used as delimiters + + [Version : 1.4.3] [Date : 2017/03/03] [Author : CV] + . Handle the case where Unicode fonts can also have an associated /Encoding object, specifying a + /Differences flag that maps character ids to other character ids. + . 
The checkings performed on special /Contents references which reference an object holding the list + of the objects contained in the page was a little bit too relaxed and, in some rare cases, caused + a few text blocks to be missing in the output (this feature was introduced in version 1.4.1). + . Corrected a warning in method GetMappedFonts() + . For better performance, remove all useless instructions at the start and end of each drawing + instruction block. + . Started expanding the PdfTexterEncodingFont::$Encodings table to add character mappings with + character names not described in the PDF specifications. + . Better handle recognition of author information (sometimes, the hex2bin() function issued a warning) + + [Version : 1.4.2] [Date : 2017/03/02] [Author : CV] + . Fixed mapping issue when the same character map was referenced by several font definitions (only the + first definition was associated to the character map, which caused incorrect character mappings when + subsequent fonts were used). + . The regular expression catching author information was a little bit too greedy, causing some + misinterpretations and warnings in the output. + + [Version : 1.4.1] [Date : 2017/02/27] [Author : CV] + . Enhanced page contents decoding (sometimes, the parameter of the /Contents flags does not reference + objects containing drawing instructions, but objects containing the list of objects which in turn + contain the real drawing instructions). This caused in rare cases some contents to be missed in the + output. + . Changed the way compound objects are handled : instead of decoding them on-the-fly, preprocess them + before any other processing takes place. This ensure that forward references to objects defined later + in the pdf file will be satisfied. + . Ignore embedded images inside drawing instructions blocks ; the presence of gzipped data inside the + embedded image could cause misinterpretation of drawing instructions following it. 
For the current + version, embedded images will not be integrated into the image-extraction process. + . Fixed issue where the same group of text was extracted several times with some PDF samples. + . Better handle Japanese documents + + [Version : 1.4.0] [Date : 26/02/2017] [Author : CV] + . First implementation for handling languages written from right-to-left (RTL). + + [Version : 1.3.18] [Date : 25/02/2017] [Author : CV] + . Corrected the PdfTexterUnicodeMap class, which did not correctly decode character maps in files + generated on Apple where line-endings are carriage returns. As a result, output looked like garbage + data. + + [Version : 1.3.17] [Date : 25/02/2017] [Author : CV] + . Completely rewrote the way text specified between parentheses, either as plain text or 2-bytes + character codes, is processed. Some character values preceded with a backslash are escape sequences, + which were not recognized in all cases, thus causing a shift when interpreting 2-bytes values and a + bad mapping for the 2-bytes sequences that followed. + + [Version : 1.3.16] [Date : 2017/02/23] [Author : CV] + . Changed the GetFontByMapId() method which was incorrectly searching a global font instead of searching + first for a page-specific font. + . Compound objects were not correctly handled when object number/offset pairs were separated by a newline + (str_replace() was called instead of preg_replace). This caused some objects to be missed. + . Performed some optimizations which allow for a performance gain of 5 to 10 percent in certain cases. + . Removed the suppression of carriage returns, as this is the only line separator used by Adobe software + on Apple. + + [Version : 1.3.15] [Date : 2017/02/12] [Author : CV] + . Handles author information whose keyword values refer to existing object contents instead of + referring to a direct value + . 
Handles new constructions for beginbfchar/endbfchar constructs, which can act as beginbfrange ; + Example : + <21> <0009 0020 000d> + means : + . Map character #21 to #0009 + . Map character #22 to #0020 + . Map character #23 to #000D + There is no clue in the Adobe PDF specification that a single character could be mapped to a range. + The normal constructs would be : + <21> <0009> + <22> <0020> + <23> <000D> + . Regular expressions matching Postscript instructions to be removed before interpreting the remaining + contents were sometimes catenating one instruction with the first parameter of the following one, + which caused bad interpretation, some warnings and bad handling of the layout (some lines could be + catenated together). + . Changed the MinSpaceWidth value from 250 to 200 (certain files separate words with lower spacing + values) + + [Version : 1.3.14] [Date : 2017/02/07] [Author : CV] + . Fixed the *_strpos methods which did not return correct page information any more + . Added new font aliases possibilities (/0 through /9 and /a through /z) + + [Version : 1.3.13] [Date : 2017/02/05] [Author : CV] + . Added the MaxSelectedPages property to extract only the first or last x pages of the document. + . Pure JPEG images are no more loaded into memory (using the gd library) when the + PDFOPT_AUTOSAVE_IMAGES flag is specified. + + [Version : 1.3.12] [Date : 2017/02/01] [Author : CV] + . Enhanced image extraction by adding support for more image formats, notably those having the + /FlateDecode flag. The new supported image formats are : + - Standard JPEG data, not initially specified as a real image + - Image data encoded as : + . RGB color values + . CMYK color values + . Gray scale color values + Currently, only 8-bits color components are supported. + . Added the PdfInlinedImage class, and changed the AddImage() and DecodeImage() methods to handle + these new image processing enhancements. . 
Corrected incorrect character mappings : the regex that matches beginbfrange such as : <32> <33> <100> also matched : <32> <33> [<57> <59>] (!) as if it was specified as : <32> <33> <59> - This caused incorrect translation of characters sometimes. This seems to be a bug of the PCRE package ; - as a workaround, the regex matching the second form is tried first. - - [Version : 1.3.11] [Date : 2017/01/28] [Author : CV] - . Added the possibility to handle images encoded with the /FlateDecode/DCTDecode flags, which contain - gzipped JPEG data. Other types of images can also be encoded this way but in different formats, and - will be processed later. - . Added the PDFOPT_AUTOSAVE_IMAGES flag to autosave images to external files without keeping them - into memory. - . The new property ImageAutoSaveFileTemplate gives the template for the external file name when autosave - mode is enabled. - . The new property ImageAutoSaveFormat can be set to one of the IMG_* constants defined in the gd lib. - . Added the ImageCount property, which counts the number of images found in the document even if they - are not processed. - . Added the Output() method to the PdfImage class to display image contents on standard output - . Changed the PeekAuthorInformation() method to remove extraneous newlines when dealing with hex-encoded - data. - . Corrected the ExtractText() method which could generate divisions by zero in some cases. - . Added the PdfImage::DestroyImageResource() method, to free libgd memory. - - [Version : 1.3.10] [Date : 2017/01/11] [Author : CV] - . Prevented a warning in GetFontAttributes() when no font resource is defined (this typically happens - for pdf files where the text is graphically drawn and does not use font tables). - . Added a custom hex2bin() function for PHP versions < 5.4.0 - - [Version : 1.3.9] [Date : 2016/01/01] [Author : CV] - . Fixed warning messages produced when encountering font/map associations in objects with no stream - defined. 
- - [Version : 1.3.8] [Date : 2016/12/24] [Author : CV] - . Added one more place in the Load() method where to recognize associations between font aliases and - the object containing their definitions (some associations were "missed" in certain documents) - . Font specifiers containing a dot were not recognized (eg : /F1.0). - . Corrected a few warnings issued for certain files containing CID fonts - - [Version : 1.3.7] [Date : 2016/12/07] [Author : CV] - . The PdfTexterFontTable::MapCharacter() method was generating notices for fonts without character maps. - . The regular expression in the PdfToText::Load() method was sometimes confusing PCRE when trying to - match stream/endstream constructs because it contained [backslash]r and [backslash]n. This caused - some objects to be missed from the input PDF file. They have been replaced with [backslash]s, and - carriage returns/line feeds are removed later from the beginning of the stream data. - - [Version : 1.3.6] [Date : 2016/12/02] [Author : CV] - . Started implementation of CID fonts (EXPERIMENTAL) : - - Added the PdfTexterCIDMap abstract class. - - Added the PdfTexterIdentityHMap class, to implement the IDENTITY-H CID font. - . CID tables are externalized and located into the directory pointed to by the PdfToText::CIDTablesDirectory - public static property. Currently, only the IDENTITY-H CID font is (partially) implemented. - . Usual behavior remains for inexisting CID substitution tables : garbage will be produced. - - [Version : 1.3.5] [Date : 2016/12/02] [Author : CV] - . Now handles references to font aliases that are local to a page. Previously, font aliases were - considered as global to the document, which caused some incorrect character substitutions. - . Throw an exception if the mbstring extension is not loaded. - . 
Compatibility with PHP versions prior to 5.6 : - - The memory_get_usage() and memory_get_peak_usage() functions are used only if implemented (they were - implemented far later on Windows than on Unix). If not available, the MemoryUsage and MemoryPeakUsage - properties will be set to zero. - . Fixed : Offsets specified between character strings were incorrectly interpreted, which sometimes - caused groups of characters to be catenated together. - - [Version : 1.3.4] [Date : 2016/11/11] [Author : CV] - . The PdfToText class is now compatible with PHP versions < 5.6 - . Changed errors to warnings about unimplemented features. - . Corrected a warning issued by the Load() function when looping through page contents : some of them - were NULL, instead of being an array, because of the modifications included in template processing - in version 1.3.2 (related function : PdfTexterPageMap::MapKids). - - [Version : 1.3.3] [Date : 2016/11/05] [Author : CV] - . Allow the %PDF tag that signals the start of the PDF document to be located anywhere in the file, - even if preceded with garbage (Acrobat Reader is able to open such files) - . Added the $DocumentStartOffset property to indicate the real start of the document - . Add the $Statistics property with the following entries : - - 'TextSize' : total size of drawing instructions (text objects) - - 'OptimizedTextSize' : total size of drawing instructions, after removing the ones that are - useless for text extraction - . Added new regular expressions to remove useless drawing instructions - . __strip_useless_instructions() method : removed carriage returns to simplify regular expressions and - added a second preg_match() to remove single-line instructions not processed by the first one - . Handle the case where author information is expressed in UTF-16 with a BOM. - . Handle the case where author information is expressed as a series of hex digits. - - [Version : 1.3.2] [Date : 2016/11/02] [Author : CV] - . 
Template references can reference an object, which in turn has its own template references. - Changed the ProcessTemplateReferences() method to recursively handle such a situation. - . Reset internal structures at the end of the Load() method to save memory usage - . For backward compatibility with PHP versions < 5.2.11 where the mb_convert_encoding() function did not - recognize hexadecimal HTML entities, changed the CodePointToUtf8() method to use only HTML entities - expressed as decimal values. - - [Version : 1.3.1] [Date : 2016/10/30] [Author : CV] - . Author data was not correctly extracted in some cases. - . The ProcessTemplateReferences() issued some warnings in some cases, when no page structure is defined - by the document - - [Version : 1.3.0] [2016/10/27] [Author : CV] - . Added support for indirect object references in text drawing instructions. Such references may be of - the form /Tplx, and are further described as /XObjects within the PDF file. Without this, some parts - of the text contained in the PDF file will be completely missed. - . Added the MemoryUsage and MemoryPeakUsage properties, that give the difference between the memory - occupied at the start and at the end of the Load() method. Note that this will not give the maximum - amount of memory that has been occupied at a given time. - - [Version : 1.2.52] [Date : 2016/10/23] [Author : CV] - . Although page header and footer contents are not yet publicly available, they are internally extracted - and removed from page contents. The regular expression that captured such data was too greedy, and - caused regular page contents to be mistakenly interpreted as header or footer contents. - - [Version : 1.2.51] [Date : 2016/10/22] [Author : CV] - . 
Load() method : Added a comprehensive coverage of errors that may be returned by preg_match_all() - when called for extracting obj/endobj constructs (some PDF files may lead pcre functions to reach the - pcre.recursion_limit or pcre.backtrack_limit settings of php.ini). - - [Version : 1.2.50] [Date : 2016/10/19] [Author : CV] - . In the Adobe Postscript language, displaying text such as "(my car)" requires the left and right - parentheses to be escaped with a backslash. The class did not handle the case where the - current font uses two-bytes characters, including the backslash itself : in such cases, the backslash - was recognized as a normal character and, since escaped characters are always represented with one byte, - the remaining input was shifted by one byte, giving most frequently far-east Unicode characters, - such as Chinese. Thus, "(my car)" produced the string "\" followed by Unicode garbage. - - [Version : 1.2.49] [Date : 2016/10/15] [Author : CV] - . Font name specifiers (/Fx, /TTy, /Rz etc.) were processed differently by the IsFontMap() method (used - to recognize an object that has font specifier/pdf object associations), the AddFontMap() method - (which adds a font map to the internal font table) and the __next_instruction() method (which tries - to recognize font specifiers within Postscript instructions). Such differences in the way of handling - font specifiers led to discrepancies in the output text, with regards to the original text. - . Added the PdfToTextBase::$FontSpecifiers, which is now the regular expression used throughout this - class to recognize a font specifier. - . Added the /OPBaseFont and /OPSUFont font specifiers (used by Rank Xerox scanners). - . Added the PdfTexterPageMap::GetResourceMappings() method. - - [Version : 1.2.48] [Date : 2016/10/14] [Author : CV] - . 
Some pages could not be extracted correctly from PDF documents including nested page content - descriptions (ie, pages leading to another object listing in turn the objects that describe the page, - instead of directly leading to the object that describe the page). - - [Version : 1.2.47] [Date : 2016/09/13] [Author : CV] - . Changed the licensing model from GPL TO LGPL. - . Specialized the PdfImage class into subclasses. The only available subclass for now is PdfJpegImage. - . Added the $EncryptMetadata property, coming from encryption information present in the PDF file. - - [Version : 1.2.46] [Date : 2016/08/24] [Author : CV] - . A number of inconsistencies was found in the chain of MapCharacter() methods up to array access - on a PdfTexterCharacterMap object. Sometimes, a UTF8 string was returned, and sometimes it was a - Unicode code point. This led to bad character mappings, especially on files generated with - PrimoPdf. - . Added the Title property, coming from author information - . Added a few regular expressions to remove unprocessed drawing instructions in the - $IgnoredInstructionsTemplates array, to reduce the number of instructions processed. - . Fixed a few caching issues about character maps - - [Version : 1.2.45] [Date : 2016/08/22] [Author : CV] - . Text objects containing page header/footer drawing instructions were unduly discarded, even if they - contained normal text. Added the ExtractTextData() method to handle such cases. - - [Version : 1.2.44] [Date : 2016/08/21] [Author : CV] - . Temporarily disabled processing of far-east characters specified as plain text ( "(xy)" ) instead of - hex string ( "" ). This was causing problems with PDF files that really use plain-text (mostly - causing Chinese characters to be displayed instead of strings using the European alphabet). - - [Version : 1.2.43] [Date : 2016/08/21] [Author : CV] - . 
Enhanced handling of fonts not using Unicode character maps : - - Added support for the "/gxx" notation for the /Differences tag, where "xx" is a character number. - - Characters not listed in the /Differences tag were not using the standard Adobe encoding maps - . Added support for password-protected files (note that the current version is not able to decrypt - files yet) : - - Added the $user_password and $owner_password parameters to the class constructor and the Load() - method. - - Added the ID/ID2 readonly property, which comes from the unique file identifier extracted - from the file contents. - - Added the UserPassword, OwnerPassword and IsPasswordProtected properties. - - Added the GetTrailerInformation() and Decrypt() methods - - Added the Encryption* properties - - [Version : 1.2.42] [Date : 2016/08/12] [Author : CV] - . The /R font alias was no more recognized, which caused bad character translations. - . Some characters were not properly encoded into UTF8, for blocks of text using the internal Adobe - Windows Ansi and Mac Roman character sets. - . Rearranged some property initialization values that caused syntax errors for PHP versions < 5.6. - . Fixed some y-positioning issues when relative "Td" instructions are used. This prevented line breaks - to be inserted when necessary. - . Temporarily commented out lines of code which were trying to interpret x-position : they were - unnecessarily inserting spaces inside words written in multiple chunks - . Fix : 2-bytes codes can not only be specified as hex digits, but also as ascii characters. Eg : "bh" - means : 0x6268. This caused for example Chinese characters to be wrongly interpreted on a document - generated from OpenOffice to PrimoPdf. - - [Version : 1.2.41] [Date : 2016/08/10] [Author : CV] - . Series of hex digits that represent characters and are related to unmapped fonts using the WinAnsi or - MacRoman encoding scheme were inappropriately split into chunks of 4 digits instead of 2. 
This - caused in some occasions normal characters to be interpreted as far-east languages, such as Chinese. - - [Version : 1.2.40] [Date : 2016/08/09] [Author : CV] - . Changed the way the PdfToText::$CharacterClass array is initialized. It was using constructs such as : - [ 'a' => self::CTYPE_ALNUM | self::CTYPE_XDIGIT, ... ] - which is authorized only for PHP versions >= 5.6. - - [Version : 1.2.39] [Date : 2016/08/09] [Author : CV] - . Entirely rewrote the PdfTexterPageMap class, which was incorrectly handling nested page descriptions. - . In the Load() method, changed the way text is extracted from objects : instead of starting from the - list of available text objects and trying to retrieve their page number, the method now loops through - page numbers (defined in the PageMap object property) and use the associated text object ids to - extract their contents. - - [Version : 1.2.38] [Date : 2016/08/09] [Author : CV] - . Bug fix : The PdfTexterFont::MapCharacter() method was not modified after the transition to a better - Unicode-to-UTF8 translation ; it was still accepting characters, while it should have been accepting - integer values (character codes). - . Bug fix : unwanted headers and footers were not recognized appropriately in some cases. - - [Version : 1.2.37] [Date : 2016/08/08] [Author : CV] - . (optimization) Checking against header or footer data is now made in the ExtractText() method, instead - of __next_instruction(), which caused too many calls to the preg_match() function. - . Added the IsPageHeaderOrFooter() method. - . Bug fix : Positive offsets between two text groups were unduly taken into account for the number of - spaces to be inserted between those groups (negative offsets add spacing, while positive ones are - subtracted from the current x-position). - . Bug fix : Space insertion for relative x-positioning did not take into account the last x position. - - [Version : 1.2.36] [Date : 2016/08/07] [Author : CV] - . 
Added the PDFOPT_NO_HYPHENATED_WORDS option to remove hyphens that break words on two lines. - . (optimization) Introduced a static array giving the character class for some characters - (alpha, digit, etc.) - . Bug fix : characters present in plain text were translated to Ascii NUL - . Bug fix : the __next_token() function was also returning the next character after character codes - specified within angle brackets ("<>"), which caused extra NUL values to be displayed in the output. - - [Version : 1.2.35] [Date : 2016/08/06] [Author : CV] - . (optimization) Reduced the number of times author information is scanned in pdf objects. - . (optimization) Removed useless calls to str_pad(), strcasecmp() and substr(). - . Optimized the __next_token() method - - [Version : 1.2.34] [Date : 2016/08/05] [Author : CV] - . Reduced the number of calls to certain built-in functions (ctype_* functions) - . Font maps were stored using only the number part of their specification (eg, "1_0" for "/C1_0"). This - led to override existing fonts using different notations (ie, "/C1_0" will override an existing font - map that was declared using "/T1_0"). The consequence is that sometimes, the character map used to - translate text was not the appropriate one, hence some badly displayed characters. - . Fixed an "uninitialized string offset" PHP notice in the __next_token() method. - . Fixed some cases where too many line breaks were inserted between two lines. - - [Version : 1.2.33] [Date : 2016/08/05] [Author : CV] - . For optimization reasons, reduced the number of times certain methods were called (GetMapWidth, - PeekAuthorInformation, IsMapped, GetFontByMapId, __get_character_padding, ...) - . Removed a regular expression for reducing drawing contents size which was a little bit too greedy - and caused some characters to be removed from plain text. - - [Version : 1.2.32] [Date : 2016/08/05] [Author : CV] - . 
Optimized regular expressions that remove useless Adobe Postscript instructions and added new ones. - . Rewrote the CodePointToUtf8() method. - . Handled a new way to specify font aliases : /TTx. - - [Version : 1.2.31] [Date : 2016/08/04] [Author : CV] - . Removed irrelevant Postscript instructions from text streams (such as graphical drawing instructions), - to reduce the work of the tokenizer (the __next_token() method), which is written in PHP and not as - efficient as a tokenizer written in C. - Removal is done using the preg_replace() function. - . Character translation results are now buffered, to avoid unnecessary calls to the MapCharacter() - method of the PdfTexterFontTable class. - - [Version : 1.2.30] [Date : 2016/08/02] [Author : CV] - . Handle object streams, which is a way to group several objects into a single pdf object (in the same - stream). The object flags are : /Type/ObjStm. This explains why certain paragraphs were missing in - certain PDF samples : they were simply "hidden" in object streams. - . The static variable PdfToText::$Utf8PlaceHolder can now include a format to be used for substitutions - of Unicode characters that cannot be translated ; one parameter will be passed to the sprintf() - function before putting it in the output text, the Unicode code point (an integer value). - . The default value for the PdfToText::$Utf8PlaceHolder, when in debug mode, is : - '[unknown character 0x%08X]' - Note that the $DEBUG static variable must be set BEFORE the first instantiation of a PdfToText object. - - [Version : 1.2.29] [Date : 2016/08/02] [Author : CV] - . Changed the way the PdfTexterUnicodeCharacterMap class handles character ranges to reduce memory usage - for PDF files defining numerous ranges in character maps. A sample that needed more than 128Mb of - memory now runs correctly with a memory limit of 32Mb. - . Corrected an incorrect reference to self::$DEBUG in PdfObjectBase class. - . Added the IsObjectStream() method. 
- - [Version : 1.2.28] [Date : 2016/08/01] [Author : CV] - . Better handle Unicode translations. Added the CodePointToUtf8() method. - . Added the static PdfToText::$Utf8Placeholder property, which is used when a Unicode character could - not be converted to an UTF8 string. - - [Version : 1.2.27] [Date : 2016/08/01] [Author : CV] - . Corrected a bad if() condition in the __next_token() method which caused the message 'Unitialized - string offset xxx' to sometimes occur. - . Added the PDFOPT_IGNORE_TEXT_LEADING option. This option must be used when you notice that an - unnecessary amount of empty lines are inserted between two text elements. This is the symptom that - the pdf file contains only relative positioning instructions combined with big values of text leading - instructions. - . For text fonts not having character maps, take into account the encoding specified in the font - attributes, such as WinAnsi or MacOsRoman, where some characters codes cannot be directly mapped to - Unicode characters (this was causing characters such as the Euro or (TM) signs to be incorrectly - translated). - - [Version : 1.2.26] [Date : 2016/07/30] [Author : CV] - . Throw an exception when an unsupported encoding format or when bad flate decoding data is encountered - only if self::$DEBUG is greater than 1. - . Added the EOL property, which is used for line breaks in the extracted text. Default is PHP_EOL. - . Some text constructs can contain a continuation line, such as : - (this is a sentence \ - split over two lines) - Removed the continuation line sequence, which caused unnecessary line break. - - [Version : 1.2.25] [Date : 2016/07/28] [Author : CV] - . Handled yet another way to specify a font resource : /C0_0, /C0_1, etc. It behaves like the /Fx and - /fx-y notations, in the sense they are a way to associate a font resource object with some kind of - alias (although the pdf specification only talks about /Fx). - - [Version : 1.2.24] [Date : 2016/07/28] [Author : CV] - . 
Decimal numbers not having a leading zero were not recognized as decimal numbers in text coordinates. - . Page maps using the /Kids and /Count flags can be nested ; the top-level /Kids page map contains the - sum of all the pages in its /Kids descendents for its /Count parameter. The warning signalling this - discrepancy has been disabled when not in debug mode, but the PdfTexterPageMap class will need to be - reviewed to handle this new situation. - - [Version : 1.2.23] [Date : 2016/07/27] [Author : CV] - . Added the following properties : Author, CreatorApplication, ProducerApplication, CreationDate and - ModificationDate - . Added the PeekAuthorInformation() internal method to retrieve the values of the above properties, if - present. - . Added the GetUTCDate() method to the PdfObjectBase class to reformat dates from Adobe UTC format to - an UTC format that can be understood by the strtotime() function (some dates may for example be - specified in the following format : 20160707182114+02'00', where the '00' string is not recognized) - - [Version : 1.2.22] [Date : 2016/07/26] [Author : CV] - . The BlockSeparator property was not used is some cases, which caused certain data presented in a - certain format by certain Pdf generators to appear catenated. - - [Version : 1.2.21] [Date : 2016/07/26] [Author : CV] - . Changed the Unescape() method which did not handle at all character specifications using the octal - notation. - - [Version : 1.2.20] [Date : 2016/07/24] [Author : CV] - . When encountering an unrecognized FlateDecode stream, throws an exception only if the $DEBUG global - variable is non-false, otherwise ignores the stream data. - This is a temporary measure until I find out how to properly decode such encoded streams correctly. - - [Version : 1.2.19] [Date : 2016/07/19] [Author : CV] - . The class do not process any more image contents by default. The following flags can now be specified - to the constructor if image data is to be retrieved : - . 
PDFOPT_GET_IMAGE_DATA : - Will put raw (undecoded) image data in the new $ImageData[] array property. - . PDFOPT_DECODE_IMAGE_DATA : - Will use the graphics glib library to create a jpeg resource from the raw data - encountered in the PDF stream. Specifying this flag automatically enables the - PDFOPT_GET_IMAGE_DATA flag. - . The new $ImageData property is an array of associative arrays that contains the following entries : - . 'type' - - Image type. Can be one of the following : - . 'jpeg' - - Jpeg image type. - Note that in the current version, only jpeg images are processed, until I find the - method to decode other proprietary Adobe formats. - . 'data' - - Raw image data. - - [Version : 1.2.18] [Date : 2016/07/05] [Author : CV] - . For debugging purposes, added the $object_id parameter to the DecodeData() method. - . The DecodeData() method now throws an exception if the stream object does not contain valid gzip data - . Handled the case of empty streams (!), such as : - 18 0 obj - << - /Filter /FlateDecode - /Length 0 - >> - stream - - endstream - endobj - which was causing warnings from the gzuncompress() function. - - [Version : 1.2.17] [Date : 2016/07/02] [Author : CV] - . Avoided processing of empty text blocks, which were causing extraneous line breaks in the output - . For debugging purposes, added a 'token' element in the associative array returned by the - __next_instruction() method. - . Made the difference between absolute positioning instructions ("Tm") and relative ones ("Td" and "TD") - which are often used in tabular data, by introducing the $last_relative_goto_y variable in the - ExtractText() method (individual cell contents were broken into separate lines). - - [Version : 1.2.16] [Date : 2016/07/01] [Author : CV] - . No line break was inserted when relative positioning instructions (Td and TD) were encountered. This - caused consecutive lines to be joined together. - - [Version : 1.2.15] [Date : 2016/06/30] [Author : CV] - . 
Corrected the __extract_chars_from_array() methods, which incorrectly handled escaped characters in - text groups and ate up the next character following the escaped one. For example, the following group : - [(3)-3(4)-3(.)11(5)-3(\(f\) a)9(n)4(d)-3( 3)6(4)-3(.6\()6(g)-3(\)\()8(2)4(\), )4(t)-3(h)-3(a)8(t)] - was represented as : - 34.5(f)nd 34.6(g)2)that - instead of : - 34.5(f) and 34.6(g)(2), that - - [Version : 1.2.14] [Date : 2016/06/24] [Author : CV] - . Added the __get_character_padding() method to compute the number of spaces needed between two - chunks of characters, taking into account the MinSpaceWidth property. - - [Version : 1.2.13] [Date : 2016/06/23] [Author : CV] - . Took into account the relative x-offset specified with Td/TD instructions - . Added the MinSpaceWidth property, which is to be measured in thousands of text units, to help the - class determine if spaces should be inserted between two character units. The default value is 250. - Although the value can be less than 1000, only a multiple of 1000 units will determine the total - number of spaces to be inserted if the PDF_REPEAT_SEPARATOR flag is set in the Options property. - . Relaxed a little bit the cases where a newline should be inserted - . Don't add the BlockSeparator string if the Separator and BlockSeparator properties are the same, and - the current result ends with the Separator string. When both properties are set to a space, this - avoids inserting double space between column elements. - - [Version : 1.2.12] [Date : 2016/06/21] [Author : CV] - . Better handle relative positioning instructions so that text parts supposed to be on the same line - stay on the same line. - . Added the PageSeparator property - - [Version : 1.2.11] [Date : 2016/06/19] [Author : CV] - . Renamed the "Separator" property to "BlockSeparator" - . 
The "Separator" property is now used as a separator for notations such as : - [(1)-1000(2)] - where "-1000" is a value that is subtracted to the current x-position. Some pdf documents presenting - tabular data use this characteristic to separate text in columns. The default value is " " (white - space). - . Added the PDFOPT_* option constants, which can either be specified to the class constructor or changed - by setting the new "Options" property. The only flag available for now is PDFOPT_REPEAT_SEPARATOR, which - has an interest if the offset between two text chunks is less than -2000 ; for example, the following - construct : - - [(1)-2000(2)] - - will give the string "1 2" if the PDF_REPEAT_SEPARATOR flag is not set, and "1 2" if set (assuming - the Separator property is set to a space). - - [Version : 1.2.10] [Date : 2016/06/16] [Author : CV] - . The character after an octal notation was skipped. For example, (\101 X) was rendered as "AX" instead - of "A X". - - [Version : 1.2.9] [Date : 2016/06/15] [Author : CV] - . Array of characters which included line breaks were not correctly interpreted - . Added the Separator property, which can be used to separate chunks of text that are recognized to be - on the same line. This is useful for pdf documents that contain mainly tabular data, but it could - break words if it contains textual data. - . Handled a new strange way to specify font numbers (/f-x-y instead of /Fx). - - [Version : 1.2.8] [Date : 2016/06/12] [Author : CV] - . Corrected the visibility of the Isxxx() methods, which were public instead of protected. - . Some positioning instructions can be cumulated (a 'Tm' can be followed by a 'Td'). Consider that the - last instruction wins. - - [Version : 1.2.7] [Date : 2016/06/11] [Author : CV] - . Added the Images array property, which makes available the images found in the document. The elements - of this array are image data. - . Added a few PDF_*_ENCODING constants which were missing. 
Not all encoding types have been implemented, - however. - . Added the IsImage() and DecodeImage() methods. - . Added the PdfImage class - . To simplify the management of this source between the Thrak framework and the specific version made - for publishing on phpclasses, added the following : - - error() and warning() functions - - PdfToTextException class - - [Version : 1.2.6] [Date : 2016/06/08] [Author : CV] - . Stream/endstream contents can be unencoded and appear in clear text ! Added the PDF_TEXT_ENCODING - constant to handle this case. - . Changed the regular expression in IsFontMap() to allow spaces between "<<" and the first "/F". - . Character maps strike again ! after the issue uncovered in version 1.2.2, where constructs such as : - - <012B> <00660067> - - means "replace every reference to 0x0012B with unicode characters 0x0066 and 0x0067 (for maps having - a width of 2 bytes), I discovered that some pdf documents having character maps of 1 byte could hold - entries such as : - - <03> <0020> - - which simply means "replace every reference to 0x03 with character 0x20"... I have not seen any - differentiating factor between the sample I handled in version 1.2.2 and this one. All I can say is - that I put a horrible kludge in PdfTexterUnicodeMap::offsetGet(), to handle a situation were - character widths are one-byte long, and their substitutions can be 2-bytes long, with a leading byte - of zero. S..t. - - [Version : 1.2.5] [Date : 2016/06/07] [Author : CV] - . Tried to enhance performance by first looking for objects that contain stream/endstream constructs, - to avoid unnecessary detections of character maps, font definitions, etc. - . Consecutive text shapes introduced by the "Do" instruction were gathered on the same text line. A - line break is now inserted when a "Do" instruction is encountered. - - [Version : 1.2.4] [Date : 2016/06/03] [Author : CV] - . Introduced the PdfObjectBase class, from which all the classes defined here inherit. 
Moved all general - methods at this level. - . Added the PdfPageMap class - - [Version : 1.2.3] [Date : 2016/06/01] [Author : CV] - . Found a PDF coming from MAC outer galaxies where some lines were terminated by "\r\n", and some other - by "\r". - . Added more debugging messages when the $DEBUG static class variable is set to an integer value - greater than 1. - . Character references to a CMAP can be specified as \xyz, where "xyz" are octal digits - . The begincmap/endcmap constructs can be omitted, which initially was the criteria to determine if the - current object is a character map. Checked for the presence of beginbfchar/beginbfrange in this case. - - [Version : 1.2.2] [Date : 2016/05/31] [Author : CV] - . Modified the PdfTexterUnicodeMap class to handle cases where substitutions in beginbfchar/endbfchar - constructs contains several characters. For example : - - <012B> <00660067> - - which means that a reference to character #012B must be substituted with #0066 and #0067. - - [Version : 1.2.1] [Date : 2016/05/28] [Author : CV] - . Changed the regular expression to match stream/endstream constructs because it captured too much as - in the following example : - - << ... /Type /Stream >> - stream - ... - endstream - - (the captured data was : " >>\nstream..."). Now a stream construct is detected if not preceded by a - slash. - . Found one case where the beginbfchar/enbfchar and all its contents were put on one line, thus making - the regular expression for capturing characters to fail. Hope this will not happen with beginbfrange... - - [Version : 1.1] [Date : 2016/05/21] [Author : CV] - . Added support to retrieve the page number associated to a character offset in the Text contents. New - methods are : - - GetPageFromOffset - - text_strpos/text_stripos - - document_strpos/document_stripos - - text_match/document_match - - [Version : 1.0.1] [Date : 2016/05/12] [Author : CV] - . 
Added code to ignore page headers and footers, which caused unnecessary newlines to be added to the - output (handling page headers and footers would require to break the code). - . The last y-position was not correctly tracked in some cases. - - [Version : 1.0] [Date : 2016/04/16] [Author : CV] - Initial version. + This caused incorrect translation of characters sometimes. This seems to be a bug of the PCRE package ; + as a workaround, the regex matching the second form is tried first. + + [Version : 1.3.11] [Date : 2017/01/28] [Author : CV] + . Added the possibility to handle images encoded with the /FlateDecode/DCTDecode flags, which contain + gzipped JPEG data. Other types of images can also be encoded this way but in different formats, and + will be processed later. + . Added the PFOPT_AUTOSAVE_IMAGES flag to autosave images to external files without keeping them + into memory. + . The new property ImageAutoSaveFileTemplate give the template for the external file name when autosave + mode is enabled. + . The new property ImageAutoSaveFormat can be set to one of the IMG_* constants defined in the gd lib. + . Added the ImageCount property, which counts the number of images found in the document even if they + are not processed. + . Added the Output() method to the PdfImage class to display image contents on standard output + . Changed the PeekAuthorInformation() method to remove extraneous newlines when dealing with hex-encoded + data. + . Corrected the ExtractText() method which could generate divisions by zero in some cases. + . Added the PdfImage::DestroyImageResource() method, to free libgd memory. + + [Version : 1.3.10] [Date : 2017/01/11] [Author : CV] + . Prevented a warning in GetFontAttributes() when no font resource is defined (this typically happens + for pdf files where the text is graphically drawn and does not use font tables). + . Added a custom hex2bin() function for PHP versions < 5.4.0 + + [Version : 1.3.9] [Date : 2016/01/01] [Author : CV] + . 
Fixed warning messages produced when encountering font/map associations in objects with no stream + defined. + + [Version : 1.3.8] [Date : 2016/12/24] [Author : CV] + . Added one more place in the Load() method where to recognize associations between font aliases and + the object containing their definitions (some associations were "missed" in certain documents) + . Font specifiers containing a dot were not recognized (eg : /F1.0). + . Corrected a few warnings issued for certain files containing CID fonts + + [Version : 1.3.7] [Date : 2016/12/07] [Author : CV] + . The PdfTexterFontTable::MapCharacter() method was generating notices for fonts without character maps. + . The regular expression in the PdfToText::Load() method was sometimes confusing PCRE when trying to + match stream/endstream constructs because it contained [backslash]r and [backslash]n. This caused + some objects to be missed from the input PDF file. They have been replaced with [backslash]s, and + carriage returns/line feeds are removed later from the beginning of the stream data. + + [Version : 1.3.6] [Date : 2016/12/02] [Author : CV] + . Started implementation of CID fonts (EXPERIMENTAL) : + - Added the PdfTexterCIDMap abstract class. + - Added the PdfTexterIdentityHMap class, to implement the IDENTITY-H CID font. + . CID tables are externalized and located into the directory pointed to by the PdfToText::CIDTablesDirectory + public static property. Currently, only the IDENTITY-H CID font is (partially) implemented. + . Usual behavior remains for inexisting CID substitution tables : garbage will be produced. + + [Version : 1.3.5] [Date : 2016/12/02] [Author : CV] + . Now handles references to font aliases that are local to a page. Previously, font aliases were + considered as global to the document, which caused some incorrect character substitutions. + . Throw an exception if the mbstring extension is not loaded. + . 
Compatibility with PHP versions prior to 5.6 : + - The memory_get_usage() and memory_get_peak_usage() functions are used only if implemented (they were + implemented far later on Windows than on Unix). If not available, the MemoryUsage and MemoryPeakUsage + properties will be set to zero. + . Fixed : Offsets specified between character strings were incorrectly interpreted, which sometimes + caused groups of characters to be catenated together. + + [Version : 1.3.4] [Date : 2016/11/11] [Author : CV] + . The PdfToText class is now compatible with PHP versions < 5.6 + . Changed errors to warnings about unimplemented features. + . Corrected a warning issued by the Load() function when looping through page contents : some of them + were NULL, instead of being an array, because of the modifications included in template processing + in version 1.3.2 (related function : PdfTexterPageMap::MapKids). + + [Version : 1.3.3] [Date : 2016/11/05] [Author : CV] + . Allow the %PDF tag that signals the start of the PDF document to be located anywhere in the file, + even if preceded with garbage (Acrobat Reader is able to open such files) + . Added the $DocumentStartOffset property to indicate the real start of the document + . Add the $Statistics property with the following entries : + - 'TextSize' : total size of drawing instructions (text objects) + - 'OptimizedTextSize' : total size of drawing instructions, after removing the ones that are + useless for text extraction + . Added new regular expressions to remove useless drawing instructions + . __strip_useless_instructions() method : removed carriage returns to simplify regular expressions and + added a second preg_match() to remove single-line instructions not processed by the first one + . Handle the case where author information is expressed in UTF-16 with a BOM. + . Handle the case where author information is expressed as a series of hex digits. + + [Version : 1.3.2] [Date : 2016/11/02] [Author : CV] + . 
Template references can reference an object, which in turn has its own template references. + Changed the ProcessTemplateReferences() method to recursively handle such a situation. + . Reset internal structures at the end of the Load() method to save memory usage + . For backward compatibility with PHP versions < 5.2.11 where the mb_convert_encoding() function did not + recognize hexadecimal HTML entities, changed the CodePointToUtf8() method to use only HTML entities + expressed as decimal values. + + [Version : 1.3.1] [Date : 2016/10/30] [Author : CV] + . Author data was not correctly extracted in some cases. + . The ProcessTemplateReferences() issued some warnings in some cases, when no page structure is defined + by the document + + [Version : 1.3.0] [2016/10/27] [Author : CV] + . Added support for indirect object references in text drawing instructions. Such references may be of + the form /Tplx, and are further described as /XObjects within the PDF file. Without this, some parts + of the text contained in the PDF file will be completely missed. + . Added the MemoryUsage and MemoryPeakUsage properties, that give the difference between the memory + occupied at the start and at the end of the Load() method. Note that this will not give the maximum + amount of memory that has been occupied at a given time. + + [Version : 1.2.52] [Date : 2016/10/23] [Author : CV] + . Although page header and footer contents are not yet publicly available, they are internally extracted + and removed from page contents. The regular expression that captured such data was too greedy, and + caused regular page contents to be mistakenly interpreted as header or footer contents. + + [Version : 1.2.51] [Date : 2016/10/22] [Author : CV] + . 
Load() method : Added a comprehensive coverage of errors that may be returned by preg_match_all() + when called for extracting obj/endobj constructs (some PDF files may lead pcre functions to reach the + pcre.recursion_limit or pcre.backtrack_limit settings of php.ini). + + [Version : 1.2.50] [Date : 2016/10/19] [Author : CV] + . In the Adobe Postscript language, displaying text such as "(my car)" requires the left and right + parentheses to be escaped with a backslash. The class did not handle the case where the + current font uses two-bytes characters, including the backslash itself : in such cases, the backslash + was recognized as a normal character and, since escaped characters are always represented with one byte, + the remaining input was shifted by one byte, giving most frequently far-east Unicode characters, + such as Chinese. Thus, "(my car)" produced the string "\" followed by Unicode garbage. + + [Version : 1.2.49] [Date : 2016/10/15] [Author : CV] + . Font name specifiers (/Fx, /TTy, /Rz etc.) were processed differently by the IsFontMap() method (used + to recognize an object that has font specifier/pdf object associations), the AddFontMap() method + (which adds a font map to the internal font table) and the __next_instruction() method (which tries + to recognize font specifiers within Postscript instructions). Such differences in the way of handling + font specifiers led to discrepancies in the output text, with regards to the original text. + . Added the PdfToTextBase::$FontSpecifiers, which is now the regular expression used throughout this + class to recognize a font specifier. + . Added the /OPBaseFont and /OPSUFont font specifiers (used by Rank Xerox scanners). + . Added the PdfTexterPageMap::GetResourceMappings() method. + + [Version : 1.2.48] [Date : 2016/10/14] [Author : CV] + .
Some pages could not be extracted correctly from PDF documents including nested page content + descriptions (ie, pages leading to another object listing in turn the objects that describe the page, + instead of directly leading to the object that describes the page). + + [Version : 1.2.47] [Date : 2016/09/13] [Author : CV] + . Changed the licensing model from GPL to LGPL. + . Specialized the PdfImage class into subclasses. The only available subclass for now is PdfJpegImage. + . Added the $EncryptMetadata property, coming from encryption information present in the PDF file. + + [Version : 1.2.46] [Date : 2016/08/24] [Author : CV] + . A number of inconsistencies were found in the chain of MapCharacter() methods up to array access + on a PdfTexterCharacterMap object. Sometimes, a UTF8 string was returned, and sometimes it was a + Unicode code point. This led to bad character mappings, especially on files generated with + PrimoPdf. + . Added the Title property, coming from author information + . Added a few regular expressions to remove unprocessed drawing instructions in the + $IgnoredInstructionsTemplates array, to reduce the number of instructions processed. + . Fixed a few caching issues about character maps + + [Version : 1.2.45] [Date : 2016/08/22] [Author : CV] + . Text objects containing page header/footer drawing instructions were unduly discarded, even if they + contained normal text. Added the ExtractTextData() method to handle such cases. + + [Version : 1.2.44] [Date : 2016/08/21] [Author : CV] + . Temporarily disabled processing of far-east characters specified as plain text ( "(xy)" ) instead of + hex string ( "" ). This was causing problems with PDF files that really use plain-text (mostly + causing Chinese characters to be displayed instead of strings using the European alphabet). + + [Version : 1.2.43] [Date : 2016/08/21] [Author : CV] + .
Enhanced handling of fonts not using Unicode character maps : + - Added support for the "/gxx" notation for the /Differences tag, where "xx" is a character number. + - Characters not listed in the /Differences tag were not using the standard Adobe encoding maps + . Added support for password-protected files (note that the current version is not able to decrypt + files yet) : + - Added the $user_password and $owner_password parameters to the class constructor and the Load() + method. + - Added the ID/ID2 readonly property, which comes from the unique file identifier extracted + from the file contents. + - Added the UserPassword, OwnerPassword and IsPasswordProtected properties. + - Added the GetTrailerInformation() and Decrypt() methods + - Added the Encryption* properties + + [Version : 1.2.42] [Date : 2016/08/12] [Author : CV] + . The /R font alias was no longer recognized, which caused bad character translations. + . Some characters were not properly encoded into UTF8, for blocks of text using the internal Adobe + Windows Ansi and Mac Roman character sets. + . Rearranged some property initialization values that caused syntax errors for PHP versions < 5.6. + . Fixed some y-positioning issues when relative "Td" instructions are used. This prevented line breaks + from being inserted when necessary. + . Temporarily commented out lines of code which were trying to interpret x-position : they were + unnecessarily inserting spaces inside words written in multiple chunks + . Fix : 2-bytes codes can not only be specified as hex digits, but also as ascii characters. Eg : "bh" + means : 0x6268. This caused for example Chinese characters to be wrongly interpreted on a document + generated from OpenOffice to PrimoPdf. + + [Version : 1.2.41] [Date : 2016/08/10] [Author : CV] + . Series of hex digits that represent characters and are related to unmapped fonts using the WinAnsi or + MacRoman encoding scheme were inappropriately split into chunks of 4 digits instead of 2.
This + caused in some occasions normal characters to be interpreted as far-east languages, such as Chinese. + + [Version : 1.2.40] [Date : 2016/08/09] [Author : CV] + . Changed the way the PdfToText::$CharacterClass array is initialized. It was using constructs such as : + [ 'a' => self::CTYPE_ALNUM | self::CTYPE_XDIGIT, ... ] + which is authorized only for PHP versions >= 5.6. + + [Version : 1.2.39] [Date : 2016/08/09] [Author : CV] + . Entirely rewrote the PdfTexterPageMap class, which was incorrectly handling nested page descriptions. + . In the Load() method, changed the way text is extracted from objects : instead of starting from the + list of available text objects and trying to retrieve their page number, the method now loops through + page numbers (defined in the PageMap object property) and use the associated text object ids to + extract their contents. + + [Version : 1.2.38] [Date : 2016/08/09] [Author : CV] + . Bug fix : The PdfTexterFont::MapCharacter() method was not modified after the transition to a better + Unicode-to-UTF8 translation ; it was still accepting characters, while it should have been accepting + integer values (character codes). + . Bug fix : unwanted headers and footers were not recognized appropriately in some cases. + + [Version : 1.2.37] [Date : 2016/08/08] [Author : CV] + . (optimization) Checking against header or footer data is now made in the ExtractText() method, instead + of __next_instruction(), which caused too many calls to the preg_match() function. + . Added the IsPageHeaderOrFooter() method. + . Bug fix : Positive offsets between two text groups were unduly taken into account for the number of + spaces to be inserted between those groups (negative offsets add spacing, while positive ones are + subtracted from the current x-position). + . Bug fix : Space insertion for relative x-positioning did not take into account the last x position. + + [Version : 1.2.36] [Date : 2016/08/07] [Author : CV] + . 
Added the PDFOPT_NO_HYPHENATED_WORDS option to remove hyphens that break words on two lines. + . (optimization) Introduced a static array giving the character class for some characters + (alpha, digit, etc.) + . Bug fix : characters present in plain text were translated to Ascii NUL + . Bug fix : the __next_token() function was also returning the next character after character codes + specified within angle brackets ("<>"), which caused extra NUL values to be displayed in the output. + + [Version : 1.2.35] [Date : 2016/08/06] [Author : CV] + . (optimization) Reduced the number of times author information is scanned in pdf objects. + . (optimization) Removed useless calls to str_pad(), strcasecmp() and substr(). + . Optimized the __next_token() method + + [Version : 1.2.34] [Date : 2016/08/05] [Author : CV] + . Reduced the number of calls to certain built-in functions (ctype_* functions) + . Font maps were stored using only the number part of their specification (eg, "1_0" for "/C1_0"). This + led to overriding existing fonts using different notations (ie, "/C1_0" will override an existing font + map that was declared using "/T1_0"). The consequence is that sometimes, the character map used to + translate text was not the appropriate one, hence some badly displayed characters. + . Fixed an "uninitialized string offset" PHP notice in the __next_token() method. + . Fixed some cases where too many line breaks were inserted between two lines. + + [Version : 1.2.33] [Date : 2016/08/05] [Author : CV] + . For optimization reasons, reduced the number of times certain methods were called (GetMapWidth, + PeekAuthorInformation, IsMapped, GetFontByMapId, __get_character_padding, ...) + . Removed a regular expression for reducing drawing contents size which was a little bit too greedy + and caused some characters to be removed from plain text. + + [Version : 1.2.32] [Date : 2016/08/05] [Author : CV] + .
Optimized regular expressions that remove useless Adobe Postscript instructions and added new ones. + . Rewrote the CodePointToUtf8() method. + . Handled a new way to specify font aliases : /TTx. + + [Version : 1.2.31] [Date : 2016/08/04] [Author : CV] + . Removed irrelevant Postscript instructions from text streams (such as graphical drawing instructions), + to reduce the work of the tokenizer (the __next_token() method), which is written in PHP and not as + efficient as a tokenizer written in C. + Removal is done using the preg_replace() function. + . Character translation results are now buffered, to avoid unnecessary calls to the MapCharacter() + method of the PdfTexterFontTable class. + + [Version : 1.2.30] [Date : 2016/08/02] [Author : CV] + . Handle object streams, which is a way to group several objects into a single pdf object (in the same + stream). The object flags are : /Type/ObjStm. This explains why certain paragraphs were missing in + certain PDF samples : they were simply "hidden" in object streams. + . The static variable PdfToText::$Utf8PlaceHolder can now include a format to be used for substitutions + of Unicode characters that cannot be translated ; one parameter will be passed to the sprintf() + function before putting it in the output text, the Unicode code point (an integer value). + . The default value for the PdfToText::$Utf8PlaceHolder, when in debug mode, is : + '[unknown character 0x%08X]' + Note that the $DEBUG static variable must be set BEFORE the first instantiation of a PdfToText object. + + [Version : 1.2.29] [Date : 2016/08/02] [Author : CV] + . Changed the way the PdfTexterUnicodeCharacterMap class handles character ranges to reduce memory usage + for PDF files defining numerous ranges in character maps. A sample that needed more than 128Mb of + memory now runs correctly with a memory limit of 32Mb. + . Corrected an incorrect reference to self::$DEBUG in PdfObjectBase class. + . Added the IsObjectStream() method. 
+ + [Version : 1.2.28] [Date : 2016/08/01] [Author : CV] + . Better handle Unicode translations. Added the CodePointToUtf8() method. + . Added the static PdfToText::$Utf8Placeholder property, which is used when a Unicode character could + not be converted to a UTF8 string. + + [Version : 1.2.27] [Date : 2016/08/01] [Author : CV] + . Corrected a bad if() condition in the __next_token() method which caused the message 'Uninitialized + string offset xxx' to sometimes occur. + . Added the PDFOPT_IGNORE_TEXT_LEADING option. This option must be used when you notice that an + unnecessary amount of empty lines are inserted between two text elements. This is the symptom that + the pdf file contains only relative positioning instructions combined with big values of text leading + instructions. + . For text fonts not having character maps, take into account the encoding specified in the font + attributes, such as WinAnsi or MacOsRoman, where some character codes cannot be directly mapped to + Unicode characters (this was causing characters such as the Euro or (TM) signs to be incorrectly + translated). + + [Version : 1.2.26] [Date : 2016/07/30] [Author : CV] + . Throw an exception when an unsupported encoding format or when bad flate decoding data is encountered + only if self::$DEBUG is greater than 1. + . Added the EOL property, which is used for line breaks in the extracted text. Default is PHP_EOL. + . Some text constructs can contain a continuation line, such as : + (this is a sentence \ + split over two lines) + Removed the continuation line sequence, which caused an unnecessary line break. + + [Version : 1.2.25] [Date : 2016/07/28] [Author : CV] + . Handled yet another way to specify a font resource : /C0_0, /C0_1, etc. It behaves like the /Fx and + /fx-y notations, in the sense they are a way to associate a font resource object with some kind of + alias (although the pdf specification only talks about /Fx). + + [Version : 1.2.24] [Date : 2016/07/28] [Author : CV] + . 
Decimal numbers not having a leading zero were not recognized as decimal numbers in text coordinates. + . Page maps using the /Kids and /Count flags can be nested ; the top-level /Kids page map contains the + sum of all the pages in its /Kids descendants for its /Count parameter. The warning signalling this + discrepancy has been disabled when not in debug mode, but the PdfTexterPageMap class will need to be + reviewed to handle this new situation. + + [Version : 1.2.23] [Date : 2016/07/27] [Author : CV] + . Added the following properties : Author, CreatorApplication, ProducerApplication, CreationDate and + ModificationDate + . Added the PeekAuthorInformation() internal method to retrieve the values of the above properties, if + present. + . Added the GetUTCDate() method to the PdfObjectBase class to reformat dates from Adobe UTC format to + a UTC format that can be understood by the strtotime() function (some dates may for example be + specified in the following format : 20160707182114+02'00', where the '00' string is not recognized) + + [Version : 1.2.22] [Date : 2016/07/26] [Author : CV] + . The BlockSeparator property was not used in some cases, which caused certain data presented in a + certain format by certain Pdf generators to appear concatenated. + + [Version : 1.2.21] [Date : 2016/07/26] [Author : CV] + . Changed the Unescape() method which did not handle at all character specifications using the octal + notation. + + [Version : 1.2.20] [Date : 2016/07/24] [Author : CV] + . When encountering an unrecognized FlateDecode stream, throws an exception only if the $DEBUG global + variable is non-false, otherwise ignores the stream data. + This is a temporary measure until I find out how to properly decode such encoded streams. + + [Version : 1.2.19] [Date : 2016/07/19] [Author : CV] + . The class no longer processes image contents by default. The following flags can now be specified + to the constructor if image data is to be retrieved : + . 
PDFOPT_GET_IMAGE_DATA : + Will put raw (undecoded) image data in the new $ImageData[] array property. + . PDFOPT_DECODE_IMAGE_DATA : + Will use the graphics glib library to create a jpeg resource from the raw data + encountered in the PDF stream. Specifying this flag automatically enables the + PDFOPT_GET_IMAGE_DATA flag. + . The new $ImageData property is an array of associative arrays that contains the following entries : + . 'type' - + Image type. Can be one of the following : + . 'jpeg' - + Jpeg image type. + Note that in the current version, only jpeg images are processed, until I find the + method to decode other proprietary Adobe formats. + . 'data' - + Raw image data. + + [Version : 1.2.18] [Date : 2016/07/05] [Author : CV] + . For debugging purposes, added the $object_id parameter to the DecodeData() method. + . The DecodeData() method now throws an exception if the stream object does not contain valid gzip data + . Handled the case of empty streams (!), such as : + 18 0 obj + << + /Filter /FlateDecode + /Length 0 + >> + stream + + endstream + endobj + which was causing warnings from the gzuncompress() function. + + [Version : 1.2.17] [Date : 2016/07/02] [Author : CV] + . Avoided processing of empty text blocks, which were causing extraneous line breaks in the output + . For debugging purposes, added a 'token' element in the associative array returned by the + __next_instruction() method. + . Made the difference between absolute positioning instructions ("Tm") and relative ones ("Td" and "TD") + which are often used in tabular data, by introducing the $last_relative_goto_y variable in the + ExtractText() method (individual cell contents were broken into separate lines). + + [Version : 1.2.16] [Date : 2016/07/01] [Author : CV] + . No line break was inserted when relative positioning instructions (Td and TD) were encountered. This + caused consecutive lines to be joined together. + + [Version : 1.2.15] [Date : 2016/06/30] [Author : CV] + . 
Corrected the __extract_chars_from_array() methods, which incorrectly handled escaped characters in + text groups and ate up the next character following the escaped one. For example, the following group : + [(3)-3(4)-3(.)11(5)-3(\(f\) a)9(n)4(d)-3( 3)6(4)-3(.6\()6(g)-3(\)\()8(2)4(\), )4(t)-3(h)-3(a)8(t)] + was represented as : + 34.5(f)nd 34.6(g)2)that + instead of : + 34.5(f) and 34.6(g)(2), that + + [Version : 1.2.14] [Date : 2016/06/24] [Author : CV] + . Added the __get_character_padding() method to compute the number of spaces needed between two + chunks of characters, taking into account the MinSpaceWidth property. + + [Version : 1.2.13] [Date : 2016/06/23] [Author : CV] + . Took into account the relative x-offset specified with Td/TD instructions + . Added the MinSpaceWidth property, which is to be measured in thousands of text units, to help the + class determine if spaces should be inserted between two character units. The default value is 250. + Although the value can be less than 1000, only a multiple of 1000 units will determine the total + number of spaces to be inserted if the PDF_REPEAT_SEPARATOR flag is set in the Options property. + . Relaxed a little bit the cases where a newline should be inserted + . Don't add the BlockSeparator string if the Separator and BlockSeparator properties are the same, and + the current result ends with the Separator string. When both properties are set to a space, this + avoids inserting double space between column elements. + + [Version : 1.2.12] [Date : 2016/06/21] [Author : CV] + . Better handle relative positioning instructions so that text parts supposed to be on the same line + stay on the same line. + . Added the PageSeparator property + + [Version : 1.2.11] [Date : 2016/06/19] [Author : CV] + . Renamed the "Separator" property to "BlockSeparator" + . 
The "Separator" property is now used as a separator for notations such as : + [(1)-1000(2)] + where "-1000" is a value that is subtracted to the current x-position. Some pdf documents presenting + tabular data use this characteristic to separate text in columns. The default value is " " (white + space). + . Added the PDFOPT_* option constants, which can either be specified to the class constructor or changed + by setting the new "Options" property. The only flag available for now is PDFOPT_REPEAT_SEPARATOR, which + has an interest if the offset between two text chunks is less than -2000 ; for example, the following + construct : + + [(1)-2000(2)] + + will give the string "1 2" if the PDF_REPEAT_SEPARATOR flag is not set, and "1 2" if set (assuming + the Separator property is set to a space). + + [Version : 1.2.10] [Date : 2016/06/16] [Author : CV] + . The character after an octal notation was skipped. For example, (\101 X) was rendered as "AX" instead + of "A X". + + [Version : 1.2.9] [Date : 2016/06/15] [Author : CV] + . Array of characters which included line breaks were not correctly interpreted + . Added the Separator property, which can be used to separate chunks of text that are recognized to be + on the same line. This is useful for pdf documents that contain mainly tabular data, but it could + break words if it contains textual data. + . Handled a new strange way to specify font numbers (/f-x-y instead of /Fx). + + [Version : 1.2.8] [Date : 2016/06/12] [Author : CV] + . Corrected the visibility of the Isxxx() methods, which were public instead of protected. + . Some positioning instructions can be cumulated (a 'Tm' can be followed by a 'Td'). Consider that the + last instruction wins. + + [Version : 1.2.7] [Date : 2016/06/11] [Author : CV] + . Added the Images array property, which makes available the images found in the document. The elements + of this array are image data. + . Added a few PDF_*_ENCODING constants which were missing. 
Not all encoding types have been implemented, + however. + . Added the IsImage() and DecodeImage() methods. + . Added the PdfImage class + . To simplify the management of this source between the Thrak framework and the specific version made + for publishing on phpclasses, added the following : + - error() and warning() functions + - PdfToTextException class + + [Version : 1.2.6] [Date : 2016/06/08] [Author : CV] + . Stream/endstream contents can be unencoded and appear in clear text ! Added the PDF_TEXT_ENCODING + constant to handle this case. + . Changed the regular expression in IsFontMap() to allow spaces between "<<" and the first "/F". + . Character maps strike again ! after the issue uncovered in version 1.2.2, where constructs such as : + + <012B> <00660067> + + mean "replace every reference to 0x012B with unicode characters 0x0066 and 0x0067" (for maps having + a width of 2 bytes), I discovered that some pdf documents having character maps of 1 byte could hold + entries such as : + + <03> <0020> + + which simply means "replace every reference to 0x03 with character 0x20"... I have not seen any + differentiating factor between the sample I handled in version 1.2.2 and this one. All I can say is + that I put a horrible kludge in PdfTexterUnicodeMap::offsetGet(), to handle a situation where + character widths are one-byte long, and their substitutions can be 2-bytes long, with a leading byte + of zero. S..t. + + [Version : 1.2.5] [Date : 2016/06/07] [Author : CV] + . Tried to enhance performance by first looking for objects that contain stream/endstream constructs, + to avoid unnecessary detections of character maps, font definitions, etc. + . Consecutive text shapes introduced by the "Do" instruction were gathered on the same text line. A + line break is now inserted when a "Do" instruction is encountered. + + [Version : 1.2.4] [Date : 2016/06/03] [Author : CV] + . Introduced the PdfObjectBase class, from which all the classes defined here inherit. 
Moved all general + methods at this level. + . Added the PdfPageMap class + + [Version : 1.2.3] [Date : 2016/06/01] [Author : CV] + . Found a PDF coming from MAC outer galaxies where some lines were terminated by "\r\n", and some others + by "\r". + . Added more debugging messages when the $DEBUG static class variable is set to an integer value + greater than 1. + . Character references to a CMAP can be specified as \xyz, where "xyz" are octal digits + . The begincmap/endcmap constructs can be omitted, which initially was the criterion to determine if the + current object is a character map. Checked for the presence of beginbfchar/beginbfrange in this case. + + [Version : 1.2.2] [Date : 2016/05/31] [Author : CV] + . Modified the PdfTexterUnicodeMap class to handle cases where substitutions in beginbfchar/endbfchar + constructs contain several characters. For example : + + <012B> <00660067> + + which means that a reference to character #012B must be substituted with #0066 and #0067. + + [Version : 1.2.1] [Date : 2016/05/28] [Author : CV] + . Changed the regular expression to match stream/endstream constructs because it captured too much as + in the following example : + + << ... /Type /Stream >> + stream + ... + endstream + + (the captured data was : " >>\nstream..."). Now a stream construct is detected if not preceded by a + slash. + . Found one case where the beginbfchar/endbfchar and all its contents were put on one line, thus making + the regular expression for capturing characters fail. Hope this will not happen with beginbfrange... + + [Version : 1.1] [Date : 2016/05/21] [Author : CV] + . Added support to retrieve the page number associated with a character offset in the Text contents. New + methods are : + - GetPageFromOffset + - text_strpos/text_stripos + - document_strpos/document_stripos + - text_match/document_match + + [Version : 1.0.1] [Date : 2016/05/12] [Author : CV] + . 
Added code to ignore page headers and footers, which caused unnecessary newlines to be added to the + output (handling page headers and footers would require to break the code). + . The last y-position was not correctly tracked in some cases. + + [Version : 1.0] [Date : 2016/04/16] [Author : CV] + Initial version. diff --git a/LICENSE b/LICENSE index fd33bce..b88d68a 100755 --- a/LICENSE +++ b/LICENSE @@ -1,172 +1,172 @@ -Copyright (c) 2016, Christian Vigh. -All rights reserved. - -The software included in this package adheres to the GNU Lesser General -Public License, a copy of which has been put below : - - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. - - 0. Additional Definitions. - - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. - - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". 
- - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. - - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. 
You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. - - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. - - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. 
A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. (If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. - - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. 
- - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. - - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. - +Copyright (c) 2016, Christian Vigh. +All rights reserved. + +The software included in this package adheres to the GNU Lesser General +Public License, a copy of which has been put below : + + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. 
+ + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. 
+ + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. 
A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. 
+ + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. + diff --git a/Maps/adobe-charsets.map b/Maps/adobe-charsets.map index a452f1a..28e0c67 100755 --- a/Maps/adobe-charsets.map +++ b/Maps/adobe-charsets.map @@ -1,967 +1,967 @@ - array ( 0101, 0101, 0101, 0101 ), - 'AE' => array ( 0341, 0256, 0306, 0306 ), - 'Aacute' => array ( 0, 0347, 0301, 0301 ), - 'Acircumflex' => array ( 0, 0345, 0302, 0302 ), - 'Adieresis' => array ( 0, 0200, 0304, 0304 ), - 'Agrave' => array ( 0, 0313, 0300, 0300 ), - 'Aring' => array ( 0, 0201, 0305, 0305 ), - 'Atilde' => array ( 0, 0314, 0303, 0303 ), - 'B' => array ( 0102, 0102, 0102, 0102 ), - 'C' => array ( 0103, 0103, 0103, 0103 ), - 'Ccedilla' => array ( 0, 0202, 0307, 0307 ), - 'D' => array ( 0104, 0104, 0104, 0104 ), - 'E' => array ( 0105, 0105, 0105, 0105 ), - 'Eacute' => array ( 0, 0203, 0311, 0311 ), - 'Ecircumflex' => array ( 0, 0346, 0312, 0312 ), - 'Edieresis' => array ( 0, 0350, 0313, 0313 ), - 'Egrave' => array ( 0, 0351, 0310, 0310 ), - 'Eth' => array ( 0, 0, 0320, 0320 ), - 'Euro' => array ( 0, 0, 0200, 0240 ), - 'F' => array ( 0106, 0106, 0106, 0106 ), - 'G' => array ( 0107, 0107, 0107, 0107 ), - 
'H' => array ( 0110, 0110, 0110, 0110 ), - 'I' => array ( 0111, 0111, 0111, 0111 ), - 'Iacute' => array ( 0, 0352, 0315, 0315 ), - 'Icircumflex' => array ( 0, 0353, 0316, 0316 ), - 'Idieresis' => array ( 0, 0354, 0317, 0317 ), - 'Igrave' => array ( 0, 0355, 0314, 0314 ), - 'J' => array ( 0112, 0112, 0112, 0112 ), - 'K' => array ( 0113, 0113, 0113, 0113 ), - 'L' => array ( 0114, 0114, 0114, 0114 ), - 'Lslash' => array ( 0x0141, 0x0141, 0x0141, 0x0141 ), - 'M' => array ( 0115, 0115, 0115, 0115 ), - 'N' => array ( 0116, 0116, 0116, 0116 ), - 'Ntilde' => array ( 0, 0204, 0321, 0321 ), - 'O' => array ( 0117, 0117, 0117, 0117 ), - 'OE' => array ( 0352, 0316, 0214, 0226 ), - 'Oacute' => array ( 0, 0356, 0323, 0323 ), - 'Ocircumflex' => array ( 0, 0357, 0324, 0324 ), - 'Odieresis' => array ( 0, 0205, 0326, 0326 ), - 'Ograve' => array ( 0, 0361, 0322, 0322 ), - 'Oslash' => array ( 0351, 0257, 0330, 0330 ), - 'Otilde' => array ( 0, 0315, 0325, 0325 ), - 'P' => array ( 0120, 0120, 0120, 0120 ), - 'Q' => array ( 0121, 0121, 0121, 0121 ), - 'R' => array ( 0122, 0122, 0122, 0122 ), - 'S' => array ( 0123, 0123, 0123, 0123 ), - 'Scaron' => array ( 0, 0, 0212, 0227 ), - 'T' => array ( 0124, 0124, 0124, 0124 ), - 'Thorn' => array ( 0, 0, 0336, 0336 ), - 'U' => array ( 0125, 0125, 0125, 0125 ), - 'Uacute' => array ( 0, 0362, 0332, 0332 ), - 'Ucircumflex' => array ( 0, 0363, 0333, 0333 ), - 'Udieresis' => array ( 0, 0206, 0334, 0334 ), - 'Ugrave' => array ( 0, 0364, 0331, 0331 ), - 'V' => array ( 0126, 0126, 0126, 0126 ), - 'W' => array ( 0127, 0127, 0127, 0127 ), - 'X' => array ( 0130, 0130, 0130, 0130 ), - 'Y' => array ( 0131, 0131, 0131, 0131 ), - 'Yacute' => array ( 0, 0, 0335, 0335 ), - 'Ydieresis' => array ( 0, 0331, 0237, 0230 ), - 'Z' => array ( 0132, 0132, 0132, 0132 ), - 'Zcaron' => array ( 0, 0, 0216, 0231 ), - 'a' => array ( 0141, 0141, 0141, 0141 ), - 'aacute' => array ( 0, 0207, 0341, 0341 ), - 'acircumflex' => array ( 0, 0211, 0342, 0342 ), - 'acute' => array ( 0302, 
0253, 0264, 0264 ), - 'adieresis' => array ( 0, 0212, 0344, 0344 ), - 'ae' => array ( 0361, 0276, 0346, 0346 ), - 'agrave' => array ( 0, 0210, 0340, 0340 ), - 'ampersand' => array ( 0046, 0046, 0046, 0046 ), - 'aring' => array ( 0, 0214, 0345, 0345 ), - 'asciicircum' => array ( 0136, 0136, 0136, 0136 ), - 'asciitilde' => array ( 0176, 0176, 0176, 0176 ), - 'asterisk' => array ( 0052, 0052, 0052, 0052 ), - 'at' => array ( 0100, 0100, 0100, 0100 ), - 'atilde' => array ( 0, 0213, 0343, 0343 ), - 'b' => array ( 0142, 0142, 0142, 0142 ), - 'backslash' => array ( 0134, 0134, 0134, 0134 ), - 'bar' => array ( 0174, 0174, 0174, 0174 ), - 'braceleft' => array ( 0173, 0173, 0173, 0173 ), - 'braceright' => array ( 0175, 0175, 0175, 0175 ), - 'bracketleft' => array ( 0133, 0133, 0133, 0133 ), - 'bracketright' => array ( 0135, 0135, 0135, 0135 ), - 'breve' => array ( 0306, 0371, 0, 0030 ), - 'brokenbar' => array ( 0, 0, 0246, 0246 ), - 'bullet' => array ( 0267, 0245, 0225, 0200 ), - 'c' => array ( 0143, 0143, 0143, 0143 ), - 'caron' => array ( 0317, 0377, 0, 0031 ), - 'ccedilla' => array ( 0, 0215, 0347, 0347 ), - 'cedilla' => array ( 0313, 0374, 0270, 0270 ), - 'cent' => array ( 0242, 0242, 0242, 0242 ), - 'circumflex' => array ( 0303, 0366, 0210, 0032 ), - 'colon' => array ( 0072, 0072, 0072, 0072 ), - 'comma' => array ( 0054, 0054, 0054, 0054 ), - 'copyright' => array ( 0, 0251, 0251, 0251 ), - 'currency' => array ( 0250, 0333, 0244, 0244 ), - 'd' => array ( 0144, 0144, 0144, 0144 ), - 'dagger' => array ( 0262, 0240, 0206, 0201 ), - 'daggerdbl' => array ( 0263, 0340, 0207, 0202 ), - 'degree' => array ( 0, 0241, 0260, 0260 ), - 'dieresis' => array ( 0310, 0254, 0250, 0250 ), - 'divide' => array ( 0, 0326, 0367, 0367 ), - 'dollar' => array ( 0044, 0044, 0044, 0044 ), - 'dotaccent' => array ( 0307, 0372, 0, 0033 ), - 'dotlessi' => array ( 0365, 0365, 0x131, 0232 ), - 'e' => array ( 0145, 0145, 0145, 0145 ), - 'eacute' => array ( 0, 0216, 0351, 0351 ), - 'ecircumflex' => array ( 
0, 0220, 0352, 0352 ), - 'edieresis' => array ( 0, 0221, 0353, 0353 ), - 'egrave' => array ( 0, 0217, 0350, 0350 ), - 'eight' => array ( 0070, 0070, 0070, 0070 ), - 'elipsis' => array ( 0x2026, 0x2026, 0x2026, 0x2026 ), - 'ellipsis' => array ( 0x2026, 0x2026, 0x2026, 0x2026 ), - 'emdash' => array ( 0x2D, 0x2D, 0x2D, 0x2D ), - 'endash' => array ( 0x2D, 0x2D, 0x2D, 0x2D ), - 'equal' => array ( 0075, 0075, 0075, 0075 ), - 'eth' => array ( 0, 0, 0360, 0360 ), - 'exclam' => array ( 0041, 0041, 0041, 0041 ), - 'exclamdown' => array ( 0241, 0301, 0241, 0241 ), - 'f' => array ( 0146, 0146, 0146, 0146 ), - 'fi' => array ( 0xFB01, 0xFB01, 0xFB01, 0xFB01 ), - 'five' => array ( 0065, 0065, 0065, 0065 ), - 'ff' => array ( 0xFB00, 0xFB00, 0xFB00, 0xFB00 ), - 'fl' => array ( 0xFB02, 0xFB02, 0xFB02, 0xFB02 ), - 'ffi' => array ( 0xFB03, 0xFB03, 0xFB03, 0xFB03 ), - 'ffl' => array ( 0xFB04, 0xFB04, 0xFB04, 0xFB04 ), - 'florin' => array ( 0246, 0304, 0203, 0206 ), - 'four' => array ( 0064, 0064, 0064, 0064 ), - 'fraction' => array ( 0244, 0332, 0, 0207 ), - 'g' => array ( 0147, 0147, 0147, 0147 ), - 'germandbls' => array ( 0373, 0247, 0337, 0337 ), - 'grave' => array ( 0301, 0140, 0140, 0140 ), - 'greater' => array ( 0076, 0076, 0076, 0076 ), - 'guillemotleft' => array ( 0253, 0307, 0253, 0253 ), - 'guillemotright' => array ( 0273, 0310, 0273, 0273 ), - 'guilsinglleft' => array ( 0254, 0334, 0213, 0210 ), - 'guilsinglright' => array ( 0255, 0335, 0233, 0211 ), - 'h' => array ( 0150, 0150, 0150, 0150 ), - 'hungarumlaut' => array ( 0315, 0375, 0, 0034 ), - 'hyphen' => array ( 0x2D, 0x2D, 0x2D, 0x2D ), - 'i' => array ( 0151, 0151, 0151, 0151 ), - 'iacute' => array ( 0, 0222, 0355, 0355 ), - 'icircumflex' => array ( 0, 0224, 0356, 0356 ), - 'idieresis' => array ( 0, 0225, 0357, 0357 ), - 'igrave' => array ( 0, 0223, 0354, 0354 ), - 'j' => array ( 0152, 0152, 0152, 0152 ), - 'k' => array ( 0153, 0153, 0153, 0153 ), - 'l' => array ( 0154, 0154, 0154, 0154 ), - 'less' => array ( 0074, 0074, 
0074, 0074 ), - 'logicalnot' => array ( 0, 0302, 0254, 0254 ), - 'lslash' => array ( 0x0142, 0x0142, 0x0142, 0x0142 ), - 'm' => array ( 0155, 0155, 0155, 0155 ), - 'macron' => array ( 0305, 0370, 0257, 0257 ), - 'minus' => array ( 0x2D, 0x2D, 0x2D, 0x2D ), - 'mu' => array ( 0, 0265, 0265, 0265 ), - 'multiply' => array ( 0, 0, 0327, 0327 ), - 'n' => array ( 0156, 0156, 0156, 0156 ), - 'nine' => array ( 0071, 0071, 0071, 0071 ), - 'ntilde' => array ( 0, 0226, 0361, 0361 ), - 'numbersign' => array ( 0043, 0043, 0043, 0043 ), - 'o' => array ( 0157, 0157, 0157, 0157 ), - 'oacute' => array ( 0, 0227, 0363, 0363 ), - 'ocircumflex' => array ( 0, 0231, 0364, 0364 ), - 'odieresis' => array ( 0, 0232, 0366, 0366 ), - 'oe' => array ( 0372, 0317, 0234, 0234 ), - 'ogonek' => array ( 0x2DB, 0x2DB, 0x2DB, 0x2DB ), - 'ograve' => array ( 0, 0230, 0362, 0362 ), - 'one' => array ( 0061, 0061, 0061, 0061 ), - 'onehalf' => array ( 0, 0, 0275, 0275 ), - 'onequarter' => array ( 0, 0, 0274, 0274 ), - 'ordfeminine' => array ( 0343, 0273, 0252, 0252 ), - 'ordmasculine' => array ( 0353, 0274, 0272, 0272 ), - 'oslash' => array ( 0371, 0277, 0370, 0370 ), - 'otilde' => array ( 0, 0233, 0365, 0365 ), - 'p' => array ( 0160, 0160, 0160, 0160 ), - 'paragraph' => array ( 0266, 0246, 0266, 0266 ), - 'parenleft' => array ( 0050, 0050, 0050, 0050 ), - 'parenright' => array ( 0051, 0051, 0051, 0051 ), - 'percent' => array ( 0045, 0045, 0045, 0045 ), - 'period' => array ( 0056, 0056, 0056, 0056 ), - 'periodcentered' => array ( 0264, 0341, 0267, 0267 ), - 'perthousand' => array ( 0275, 0344, 0211, 0213 ), - 'plus' => array ( 0053, 0053, 0053, 0053 ), - 'plusminus' => array ( 0, 0261, 0261, 0261 ), - 'q' => array ( 0161, 0161, 0161, 0161 ), - 'question' => array ( 0077, 0077, 0077, 0077 ), - 'questiondown' => array ( 0277, 0300, 0277, 0277 ), - 'quotedbl' => array ( 0x22, 0x22, 0x22, 0x22 ), - 'quotedblbase' => array ( 0x22, 0x22, 0x22, 0x22 ), - 'quotedblleft' => array ( 0x22, 0x22, 0x22, 0x22 ), - 
'quotedblright' => array ( 0x22, 0x22, 0x22, 0x22 ), - 'quoteleft' => array ( 0x27, 0x27, 0x27, 0x27 ), - 'quoteright' => array ( 0x22, 0x22, 0x22, 0x22 ), - 'quotesinglbase' => array ( 0x22, 0x22, 0x22, 0x22 ), - 'quotesingle' => array ( 0x22, 0x22, 0x22, 0x22 ), - 'r' => array ( 0162, 0162, 0162, 0162 ), - 'registered' => array ( 0, 0250, 0256, 0256 ), - 'ring' => array ( 0312, 0373, 0xB0, 0036 ), - 's' => array ( 0163, 0163, 0163, 0163 ), - 'scaron' => array ( 0, 0, 0232, 0235 ), - 'section' => array ( 0247, 0244, 0247, 0247 ), - 'semicolon' => array ( 0073, 0073, 0073, 0073 ), - 'seven' => array ( 0067, 0067, 0067, 0067 ), - 'six' => array ( 0066, 0066, 0066, 0066 ), - 'slash' => array ( 0057, 0057, 0057, 0057 ), - 'space' => array ( 0040, 0040, 0040, 0040 ), - 'sterling' => array ( 0243, 0243, 0243, 0243 ), - 't' => array ( 0164, 0164, 0164, 0164 ), - 'thorn' => array ( 0, 0, 0376, 0376 ), - 'three' => array ( 0063, 0063, 0063, 0063 ), - 'threequarters' => array ( 0, 0, 0276, 0276 ), - 'tilde' => array ( 0304, 0367, 0230, 0037 ), - 'trademark' => array ( 0, 0252, 0231, 0222 ), - 'two' => array ( 0062, 0062, 0062, 0062 ), - 'u' => array ( 0165, 0165, 0165, 0165 ), - 'uacute' => array ( 0, 0234, 0372, 0372 ), - 'ucircumflex' => array ( 0, 0236, 0373, 0373 ), - 'udieresis' => array ( 0, 0237, 0374, 0374 ), - 'ugrave' => array ( 0, 0235, 0371, 0371 ), - 'underscore' => array ( 0137, 0137, 0137, 0137 ), - 'v' => array ( 0166, 0166, 0166, 0166 ), - 'w' => array ( 0167, 0167, 0167, 0167 ), - 'x' => array ( 0170, 0170, 0170, 0170 ), - 'y' => array ( 0171, 0171, 0171, 0171 ), - 'yacute' => array ( 0, 0, 0375, 0375 ), - 'ydieresis' => array ( 0, 0330, 0377, 0377 ), - 'yen' => array ( 0245, 0264, 0245, 0245 ), - 'z' => array ( 0172, 0172, 0172, 0172 ), - 'zcaron' => array ( 0, 0, 0236, 0236 ), - 'zero' => array ( 0060, 0060, 0060, 0060 ), - - // Additions which are not described in the PDF specifications - much more foreign characters are available ! 
- // (see https://mupdf.com/docs/browse/source/pdf/pdf-glyphlist.h.html) - // The following also gives some glyph names : - // http://www.tipometar.org/pojmovnik/Hint/img/Using%20Fontographer.pdf - // This table is currently far from being complete - 'Abreve' => array ( 0x0102, 0x0102, 0x0102, 0x0102 ), - 'abreve' => array ( 0x0103, 0x0103, 0x0103, 0x0103 ), - 'Abreveacute' => array ( 0x1EAE, 0x1EAE, 0x1EAE, 0x1EAE ), - 'abreveacute' => array ( 0x1EAF, 0x1EAF, 0x1EAF, 0x1EAF ), - 'Abrevedotbelow' => array ( 0x1EB6, 0x1EB6, 0x1EB6, 0x1EB6 ), - 'abrevedotbelow' => array ( 0x1EB7, 0x1EB7, 0x1EB7, 0x1EB7 ), - 'Abrevegrave' => array ( 0x1EB0, 0x1EB0, 0x1EB0, 0x1EB0 ), - 'abrevegrave' => array ( 0x1EB1, 0x1EB1, 0x1EB1, 0x1EB1 ), - 'Abrevehookabove' => array ( 0x1EB2, 0x1EB2, 0x1EB2, 0x1EB2 ), - 'abrevehookabove' => array ( 0x1EB3, 0x1EB3, 0x1EB3, 0x1EB3 ), - 'Abrevetilde' => array ( 0x1EB4, 0x1EB4, 0x1EB4, 0x1EB4 ), - 'abrevetilde' => array ( 0x1EB5, 0x1EB5, 0x1EB5, 0x1EB5 ), - 'Acaron' => array ( 0x01CD, 0x01CD, 0x01CD, 0x01CD ), - 'acaron' => array ( 0x01CE, 0x01CE, 0x01CE, 0x01CE ), - 'Acircumflexacute' => array ( 0x1EA4, 0x1EA4, 0x1EA4, 0x1EA4 ), - 'acircumflexacute' => array ( 0x1EA5, 0x1EA5, 0x1EA5, 0x1EA5 ), - 'Acircumflexdotbelow' => array ( 0x1EAC, 0x1EAC, 0x1EAC, 0x1EAC ), - 'acircumflexdotbelow' => array ( 0x1EAD, 0x1EAD, 0x1EAD, 0x1EAD ), - 'Acircumflexgrave' => array ( 0x1EA6, 0x1EA6, 0x1EA6, 0x1EA6 ), - 'acircumflexgrave' => array ( 0x1EA7, 0x1EA7, 0x1EA7, 0x1EA7 ), - 'Acircumflexhookabove' => array ( 0x1EA8, 0x1EA8, 0x1EA8, 0x1EA8 ), - 'acircumflexhookabove' => array ( 0x1EA9, 0x1EA9, 0x1EA9, 0x1EA9 ), - 'Acircumflextilde' => array ( 0x1EAA, 0x1EAA, 0x1EAA, 0x1EAA ), - 'acircumflextilde' => array ( 0x1EAB, 0x1EAB, 0x1EAB, 0x1EAB ), - 'acutecomb' => array ( 0x0301, 0x0301, 0x0301, 0x0301 ), - 'Adot' => array ( 0x0226, 0x0226, 0x0226, 0x0226 ), - 'adot' => array ( 0x0227, 0x0227, 0x0227, 0x0227 ), - 'Adotbelow' => array ( 0x1EA0, 0x1EA0, 0x1EA0, 0x1EA0 ), - 
'adotbelow' => array ( 0x1EA1, 0x1EA1, 0x1EA1, 0x1EA1 ), - 'AEacute' => array ( 0x01FC, 0x01FC, 0x01FC, 0x01FC ), - 'aeacute' => array ( 0x01FD, 0x01FD, 0x01FD, 0x01FD ), - 'Adieresis' => array ( 0x00C4, 0x00C4, 0x00C4, 0x00C4 ), - 'adieresis' => array ( 0x00E4, 0x00E4, 0x00E4, 0x00E4 ), - 'Ahookabove' => array ( 0x1EA2, 0x1EA2, 0x1EA2, 0x1EA2 ), - 'ahookabove' => array ( 0x1EA3, 0x1EA3, 0x1EA3, 0x1EA3 ), - 'Amacron' => array ( 0x0100, 0x0100, 0x0100, 0x0100 ), - 'amacron' => array ( 0x0101, 0x0101, 0x0101, 0x0101 ), - 'Aogonek' => array ( 0x0104, 0x0104, 0x0104, 0x0104 ), - 'aogonek' => array ( 0x0105, 0x0105, 0x0105, 0x0105 ), - 'Aring' => array ( 0x00C5, 0x00C5, 0x00C5, 0x00C5 ), - 'aring' => array ( 0x00E5, 0x00E5, 0x00E5, 0x00E5 ), - 'Aringacute' => array ( 0x01FA, 0x01FA, 0x01FA, 0x01FA ), - 'aringacute' => array ( 0x01FB, 0x01FB, 0x01FB, 0x01FB ), - 'Atilde' => array ( 0x00C3, 0x00C3, 0x00C3, 0x00C3 ), - 'atilde' => array ( 0x00E3, 0x00E3, 0x00E3, 0x00E3 ), - 'Cacute' => array ( 0x0106, 0x0106, 0x0106, 0x0106 ), - 'cacute' => array ( 0x0107, 0x0107, 0x0107, 0x0107 ), - 'Ccaron' => array ( 0x010C, 0x010C, 0x010C, 0x010C ), - 'ccaron' => array ( 0x010D, 0x010D, 0x010D, 0x010D ), - 'Ccircumflex' => array ( 0x0108, 0x0108, 0x0108, 0x0108 ), - 'ccircumflex' => array ( 0x0109, 0x0109, 0x0109, 0x0109 ), - 'Cdot' => array ( 0x010A, 0x010A, 0x010A, 0x010A ), - 'cdot' => array ( 0x010B, 0x010B, 0x010B, 0x010B ), - 'Cdotaccent' => array ( 0x010A, 0x010A, 0x010A, 0x010A ), - 'cdotaccent' => array ( 0x010B, 0x010B, 0x010B, 0x010B ), - 'Dcaron' => array ( 0x010E, 0x010E, 0x010E, 0x010E ), - 'dcaron' => array ( 0x010F, 0x010F, 0x010F, 0x010F ), - 'Dcedilla' => array ( 0x1E10, 0x1E10, 0x1E10, 0x1E10 ), - 'dcedilla' => array ( 0x1E11, 0x1E11, 0x1E11, 0x1E11 ), - 'Dcroat' => array ( 0x0110, 0x0110, 0x0110, 0x0110 ), - 'dcroat' => array ( 0x0111, 0x0111, 0x0111, 0x0111 ), - 'Dmacron' => array ( 0x0110, 0x0110, 0x0110, 0x0110 ), - 'dmacron' => array ( 0x0111, 0x0111, 0x0111, 
0x0111 ), - 'dotbelowcomb' => array ( 0x0323, 0x0323, 0x0323, 0x0323 ), - 'Dslash' => array ( 0x0110, 0x0110, 0x0110, 0x0110 ), - 'dslash' => array ( 0x0111, 0x0111, 0x0111, 0x0111 ), - 'Ebreve' => array ( 0x0114, 0x0114, 0x0114, 0x0114 ), - 'ebreve' => array ( 0x0115, 0x0115, 0x0115, 0x0115 ), - 'Ecaron' => array ( 0x011A, 0x011A, 0x011A, 0x011A ), - 'ecaron' => array ( 0x011B, 0x011B, 0x011B, 0x011B ), - 'Ecedilla' => array ( 0x0228, 0x0228, 0x0228, 0x0228 ), - 'ecedilla' => array ( 0x0229, 0x0229, 0x0229, 0x0229 ), - 'Ecircumflexacute' => array ( 0x1EBE, 0x1EBE, 0x1EBE, 0x1EBE ), - 'ecircumflexacute' => array ( 0x1EBF, 0x1EBF, 0x1EBF, 0x1EBF ), - 'Ecircumflexdotbelow' => array ( 0x1EC6, 0x1EC6, 0x1EC6, 0x1EC6 ), - 'ecircumflexdotbelow' => array ( 0x1EC7, 0x1EC7, 0x1EC7, 0x1EC7 ), - 'Ecircumflexgrave' => array ( 0x1EC0, 0x1EC0, 0x1EC0, 0x1EC0 ), - 'ecircumflexgrave' => array ( 0x1EC1, 0x1EC1, 0x1EC1, 0x1EC1 ), - 'Ecircumflexhookabove' => array ( 0x1EC2, 0x1EC2, 0x1EC2, 0x1EC2 ), - 'ecircumflexhookabove' => array ( 0x1EC3, 0x1EC3, 0x1EC3, 0x1EC3 ), - 'Ecircumflextilde' => array ( 0x1EC4, 0x1EC4, 0x1EC4, 0x1EC4 ), - 'ecircumflextilde' => array ( 0x1EC5, 0x1EC5, 0x1EC5, 0x1EC5 ), - 'Edieresis' => array ( 0x00CB, 0x00CB, 0x00CB, 0x00CB ), - 'edieresis' => array ( 0x00EB, 0x00EB, 0x00EB, 0x00EB ), - 'Edot' => array ( 0x0116, 0x0116, 0x0116, 0x0116 ), - 'edot' => array ( 0x0117, 0x0117, 0x0117, 0x0117 ), - 'Edotaccent' => array ( 0x0116, 0x0116, 0x0116, 0x0116 ), - 'edotaccent' => array ( 0x0117, 0x0117, 0x0117, 0x0117 ), - 'Edotbelow' => array ( 0x1EB8, 0x1EB8, 0x1EB8, 0x1EB8 ), - 'edotbelow' => array ( 0x1EB9, 0x1EB9, 0x1EB9, 0x1EB9 ), - 'Ehookabove' => array ( 0x1EBA, 0x1EBA, 0x1EBA, 0x1EBA ), - 'ehookabove' => array ( 0x1EBB, 0x1EBB, 0x1EBB, 0x1EBB ), - 'Emacron' => array ( 0x0112, 0x0112, 0x0112, 0x0112 ), - 'emacron' => array ( 0x0113, 0x0113, 0x0113, 0x0113 ), - 'Eng' => array ( 0x014A, 0x014A, 0x014A, 0x014A ), - 'eng' => array ( 0x014B, 0x014B, 0x014B, 0x014B 
), - 'Eogonek' => array ( 0x0118, 0x0118, 0x0118, 0x0118 ), - 'eogonek' => array ( 0x0119, 0x0119, 0x0119, 0x0119 ), - 'Ering' => array ( 0x016E, 0x016E, 0x016E, 0x016E ), - 'ering' => array ( 0x016F, 0x016F, 0x016F, 0x016F ), - 'Etilde' => array ( 0x1EBC, 0x1EBC, 0x1EBC, 0x1EBC ), - 'etilde' => array ( 0x1EBD, 0x1EBD, 0x1EBD, 0x1EBD ), - 'Gacute' => array ( 0x01F4, 0x01F4, 0x01F4, 0x01F4 ), - 'gacute' => array ( 0x01F5, 0x01F5, 0x01F5, 0x01F5 ), - 'Gbreve' => array ( 0x011E, 0x011E, 0x011E, 0x011E ), - 'gbreve' => array ( 0x011F, 0x011F, 0x011F, 0x011F ), - 'Gcaron' => array ( 0x01E6, 0x01E6, 0x01E6, 0x01E6 ), - 'gcaron' => array ( 0x01E7, 0x01E7, 0x01E7, 0x01E7 ), - 'Gcedilla' => array ( 0x0122, 0x0122, 0x0122, 0x0122 ), - 'gcedilla' => array ( 0x0123, 0x0123, 0x0123, 0x0123 ), - 'Gcommaaccent' => array ( 0x0122, 0x0122, 0x0122, 0x0122 ), - 'gcommaaccent' => array ( 0x0123, 0x0123, 0x0123, 0x0123 ), - 'Gcircumflex' => array ( 0x011C, 0x011C, 0x011C, 0x011C ), - 'gcircumflex' => array ( 0x011D, 0x011D, 0x011D, 0x011D ), - 'Gdot' => array ( 0x0120, 0x0120, 0x0120, 0x0120 ), - 'gdot' => array ( 0x0121, 0x0121, 0x0121, 0x0121 ), - 'Gdotaccent' => array ( 0x0120, 0x0120, 0x0120, 0x0120 ), - 'gdotaccent' => array ( 0x0121, 0x0121, 0x0121, 0x0121 ), - 'gravecomb' => array ( 0x0300, 0x0300, 0x0300, 0x0300 ), - 'Hbar' => array ( 0x0126, 0x0126, 0x0126, 0x0126 ), - 'hbar' => array ( 0x0127, 0x0127, 0x0127, 0x0127 ), - 'Hcaron' => array ( 0x021E, 0x021E, 0x021E, 0x021E ), - 'hcaron' => array ( 0x021F, 0x021F, 0x021F, 0x021F ), - 'Hcedilla' => array ( 0x1E28, 0x1E28, 0x1E28, 0x1E28 ), - 'hcedilla' => array ( 0x1E29, 0x1E29, 0x1E29, 0x1E29 ), - 'Hcircumflex' => array ( 0x0124, 0x0124, 0x0124, 0x0124 ), - 'hcircumflex' => array ( 0x0125, 0x0125, 0x0125, 0x0125 ), - 'hookabovecomb' => array ( 0x0309, 0x0309, 0x0309, 0x0309 ), - 'Ibreve' => array ( 0x012C, 0x012C, 0x012C, 0x012C ), - 'ibreve' => array ( 0x012D, 0x012D, 0x012D, 0x012D ), - 'Icaron' => array ( 0x01CF, 0x01CF, 
0x01CF, 0x01CF ), - 'icaron' => array ( 0x01D0, 0x01D0, 0x01D0, 0x01D0 ), - 'Idieresis' => array ( 0x00CF, 0x00CF, 0x00CF, 0x00CF ), - 'idieresis' => array ( 0x00EF, 0x00EF, 0x00EF, 0x00EF ), - 'Idot' => array ( 0x00CD, 0x00CD, 0x00CD, 0x00CD ), - 'idot' => array ( 0x00ED, 0x00ED, 0x00ED, 0x00ED ), - 'Idotaccent' => array ( 0x00CD, 0x00CD, 0x00CD, 0x00CD ), - 'idotaccent' => array ( 0x00ED, 0x00ED, 0x00ED, 0x00ED ), - 'Idotbelow' => array ( 0x1ECA, 0x1ECA, 0x1ECA, 0x1ECA ), - 'idotbelow' => array ( 0x1ECB, 0x1ECB, 0x1ECB, 0x1ECB ), - 'Ihookabove' => array ( 0x1EC8, 0x1EC8, 0x1EC8, 0x1EC8 ), - 'ihookabove' => array ( 0x1EC9, 0x1EC9, 0x1EC9, 0x1EC9 ), - 'IJ' => array ( 0x0132, 0x0132, 0x0132, 0x0132 ), - 'ij' => array ( 0x0133, 0x0133, 0x0133, 0x0133 ), - 'Imacron' => array ( 0x012A, 0x012A, 0x012A, 0x012A ), - 'imacron' => array ( 0x012B, 0x012B, 0x012B, 0x012B ), - 'Iogonek' => array ( 0x012E, 0x012E, 0x012E, 0x012E ), - 'iogonek' => array ( 0x012F, 0x012F, 0x012F, 0x012F ), - 'Itilde' => array ( 0x0128, 0x0128, 0x0128, 0x0128 ), - 'itilde' => array ( 0x0129, 0x0129, 0x0129, 0x0129 ), - 'Jcaron' => array ( 0x01F0, 0x01F0, 0x01F0, 0x01F0 ), - 'jcaron' => array ( 0x01EF, 0x01EF, 0x01EF, 0x01EF ), - 'Jcircumflex' => array ( 0x0134, 0x0134, 0x0134, 0x0134 ), - 'jcircumflex' => array ( 0x0135, 0x0135, 0x0135, 0x0135 ), - 'Kacute' => array ( 0x1E30, 0x1E30, 0x1E30, 0x1E30 ), - 'kacute' => array ( 0x1E31, 0x1E31, 0x1E31, 0x1E31 ), - 'kcaron' => array ( 0x01E9, 0x01E9, 0x01E9, 0x01E9 ), - 'Kcaron' => array ( 0x01E8, 0x01E8, 0x01E8, 0x01E8 ), - 'Kcedilla' => array ( 0x0136, 0x0136, 0x0136, 0x0136 ), - 'kcedilla' => array ( 0x0137, 0x0137, 0x0137, 0x0137 ), - 'Kcommaaccent' => array ( 0x0136, 0x0136, 0x0136, 0x0136 ), - 'kcommaaccent' => array ( 0x0137, 0x0137, 0x0137, 0x0137 ), - 'kgreenlandic' => array ( 0x0138, 0x0138, 0x0138, 0x0138 ), - 'Lacute' => array ( 0x0139, 0x0139, 0x0139, 0x0139 ), - 'lacute' => array ( 0x013A, 0x013A, 0x013A, 0x013A ), - 'lcaron' => array ( 
0x013E, 0x013E, 0x013E, 0x013E ), - 'Lcaron' => array ( 0x013D, 0x013D, 0x013D, 0x013D ), - 'Lcedilla' => array ( 0x013B, 0x013B, 0x013B, 0x013B ), - 'lcedilla' => array ( 0x013C, 0x013C, 0x013C, 0x013C ), - 'Lcommaaccent' => array ( 0x013B, 0x013B, 0x013B, 0x013B ), - 'lcommaaccent' => array ( 0x013C, 0x013C, 0x013C, 0x013C ), - 'Ldot' => array ( 0x013F, 0x013F, 0x013F, 0x013F ), - 'ldot' => array ( 0x0140, 0x0140, 0x0140, 0x0140 ), - 'Macute' => array ( 0x1E3E, 0x1E3E, 0x1E3E, 0x1E3E ), - 'macute' => array ( 0x1E3F, 0x1E3F, 0x1E3F, 0x1E3F ), - 'nacute' => array ( 0x0144, 0x0144, 0x0144, 0x0144 ), - 'Nacute' => array ( 0x0143, 0x0143, 0x0143, 0x0143 ), - 'napostrophe' => array ( 0x0149, 0x0149, 0x0149, 0x0149 ), - 'nbspace' => array ( 0x0020, 0x0020, 0x0020, 0x0020 ), - 'Ncaron' => array ( 0x0147, 0x0147, 0x0147, 0x0147 ), - 'ncaron' => array ( 0x0148, 0x0148, 0x0148, 0x0148 ), - 'Ncedilla' => array ( 0x0145, 0x0145, 0x0145, 0x0145 ), - 'ncedilla' => array ( 0x0146, 0x0146, 0x0146, 0x0146 ), - 'Ncommaaccent' => array ( 0x0145, 0x0145, 0x0145, 0x0145 ), - 'ncommaaccent' => array ( 0x0146, 0x0146, 0x0146, 0x0146 ), - 'Ncircumflex' => array ( 0x1E4A, 0x1E4A, 0x1E4A, 0x1E4A ), - 'ncircumflex' => array ( 0x1E4B, 0x1E4B, 0x1E4B, 0x1E4B ), - 'Ntilde' => array ( 0x00D1, 0x00D1, 0x00D1, 0x00D1 ), - 'ntilde' => array ( 0x00F1, 0x00F1, 0x00F1, 0x00F1 ), - 'Obreve' => array ( 0x014E, 0x014E, 0x014E, 0x014E ), - 'obreve' => array ( 0x014F, 0x014F, 0x014F, 0x014F ), - 'Ocaron' => array ( 0x01D1, 0x01D1, 0x01D1, 0x01D1 ), - 'ocaron' => array ( 0x01D2, 0x01D2, 0x01D2, 0x01D2 ), - 'Ocedilla' => array ( 0x0156, 0x0156, 0x0156, 0x0156 ), - 'ocedilla' => array ( 0x0157, 0x0157, 0x0157, 0x0157 ), - 'Ocircumflexacute' => array ( 0x1ED0, 0x1ED0, 0x1ED0, 0x1ED0 ), - 'ocircumflexacute' => array ( 0x1ED1, 0x1ED1, 0x1ED1, 0x1ED1 ), - 'Ocircumflexdotbelow' => array ( 0x1ED8, 0x1ED8, 0x1ED8, 0x1ED8 ), - 'ocircumflexdotbelow' => array ( 0x1ED9, 0x1ED9, 0x1ED9, 0x1ED9 ), - 'Ocircumflexgrave' => 
array ( 0x1ED2, 0x1ED2, 0x1ED2, 0x1ED2 ), - 'ocircumflexgrave' => array ( 0x1ED3, 0x1ED3, 0x1ED3, 0x1ED3 ), - 'Ocircumflexhookabove' => array ( 0x1ED4, 0x1ED4, 0x1ED4, 0x1ED4 ), - 'ocircumflexhookabove' => array ( 0x1ED5, 0x1ED5, 0x1ED5, 0x1ED5 ), - 'Ocircumflextilde' => array ( 0x1ED6, 0x1ED6, 0x1ED6, 0x1ED6 ), - 'ocircumflextilde' => array ( 0x1ED7, 0x1ED7, 0x1ED7, 0x1ED7 ), - 'Odieresis' => array ( 0x00D6, 0x00D6, 0x00D6, 0x00D6 ), - 'odieresis' => array ( 0x00F6, 0x00F6, 0x00F6, 0x00F6 ), - 'Odot' => array ( 0x022E, 0x022E, 0x022E, 0x022E ), - 'odot' => array ( 0x022F, 0x022F, 0x022F, 0x022F ), - 'Odotbelow' => array ( 0x1ECC, 0x1ECC, 0x1ECC, 0x1ECC ), - 'odotbelow' => array ( 0x1ECD, 0x1ECD, 0x1ECD, 0x1ECD ), - 'Odblacute' => array ( 0x0150, 0x0150, 0x0150, 0x0150 ), - 'odblacute' => array ( 0x0151, 0x0151, 0x0151, 0x0151 ), - 'Ohookabove' => array ( 0x1ECE, 0x1ECE, 0x1ECE, 0x1ECE ), - 'ohookabove' => array ( 0x1ECF, 0x1ECF, 0x1ECF, 0x1ECF ), - 'Ohorn' => array ( 0x01A0, 0x01A0, 0x01A0, 0x01A0 ), - 'ohorn' => array ( 0x01A1, 0x01A1, 0x01A1, 0x01A1 ), - 'Ohornacute' => array ( 0x1EDA, 0x1EDA, 0x1EDA, 0x1EDA ), - 'ohornacute' => array ( 0x1EDB, 0x1EDB, 0x1EDB, 0x1EDB ), - 'Ohorndotbelow' => array ( 0x1EE2, 0x1EE2, 0x1EE2, 0x1EE2 ), - 'ohorndotbelow' => array ( 0x1EE3, 0x1EE3, 0x1EE3, 0x1EE3 ), - 'Ohorngrave' => array ( 0x1EDC, 0x1EDC, 0x1EDC, 0x1EDC ), - 'ohorngrave' => array ( 0x1EDD, 0x1EDD, 0x1EDD, 0x1EDD ), - 'Ohornhookabove' => array ( 0x1EDE, 0x1EDE, 0x1EDE, 0x1EDE ), - 'ohornhookabove' => array ( 0x1EDF, 0x1EDF, 0x1EDF, 0x1EDF ), - 'Ohorntilde' => array ( 0x1EE0, 0x1EE0, 0x1EE0, 0x1EE0 ), - 'ohorntilde' => array ( 0x1EE1, 0x1EE1, 0x1EE1, 0x1EE1 ), - 'Ohungarumlaut' => array ( 0x0150, 0x0150, 0x0150, 0x0150 ), - 'ohungarumlaut' => array ( 0x0151, 0x0151, 0x0151, 0x0151 ), - 'omacron' => array ( 0x014C, 0x014C, 0x014C, 0x014C ), - 'Omacron' => array ( 0x014D, 0x014D, 0x014D, 0x014D ), - 'Oogonek' => array ( 0x01EA, 0x01EA, 0x01EA, 0x01EA ), - 'oogonek' => 
array ( 0x01EB, 0x01EB, 0x01EB, 0x01EB ), - 'Oslashacute' => array ( 0x01FE, 0x01FE, 0x01FE, 0x01FE ), - 'oslashacute' => array ( 0x01FF, 0x01FF, 0x01FF, 0x01FF ), - 'Otilde' => array ( 0x00D5, 0x00D5, 0x00D5, 0x00D5 ), - 'otilde' => array ( 0x00F5, 0x00F5, 0x00F5, 0x00F5 ), - 'overscore' => array ( 0x00AF, 0x00AF, 0x00AF, 0x00AF ), - 'Pacute' => array ( 0x1E54, 0x1E54, 0x1E54, 0x1E54 ), - 'pacute' => array ( 0x1E55, 0x1E55, 0x1E55, 0x1E55 ), - 'Racute' => array ( 0x0154, 0x0154, 0x0154, 0x0154 ), - 'racute' => array ( 0x0155, 0x0155, 0x0155, 0x0155 ), - 'Rcaron' => array ( 0x0158, 0x0158, 0x0158, 0x0158 ), - 'rcaron' => array ( 0x0159, 0x0159, 0x0159, 0x0159 ), - 'Rcedilla' => array ( 0x0156, 0x0156, 0x0156, 0x0156 ), - 'rcedilla' => array ( 0x0157, 0x0157, 0x0157, 0x0157 ), - 'Rcommaaccent' => array ( 0x0156, 0x0156, 0x0156, 0x0156 ), - 'rcommaaccent' => array ( 0x0157, 0x0157, 0x0157, 0x0157 ), - 'Sacute' => array ( 0x015A, 0x015A, 0x015A, 0x015A ), - 'sacute' => array ( 0x015B, 0x015B, 0x015B, 0x015B ), - 'Scaron' => array ( 0x0160, 0x0160, 0x0160, 0x0160 ), - 'scaron' => array ( 0x0161, 0x0161, 0x0161, 0x0161 ), - 'Scedilla' => array ( 0x015E, 0x015E, 0x015E, 0x015E ), - 'scedilla' => array ( 0x015F, 0x015F, 0x015F, 0x015F ), - 'Scircumflex' => array ( 0x015C, 0x015C, 0x015C, 0x015C ), - 'scircumflex' => array ( 0x015D, 0x015D, 0x015D, 0x015D ), - 'Scommaaccent' => array ( 0x0218, 0x0218, 0x0218, 0x0218 ), - 'scommaaccent' => array ( 0x0219, 0x0219, 0x0219, 0x0219 ), - 'Tbar' => array ( 0x1E6E, 0x1E6E, 0x1E6E, 0x1E6E ), - 'tbar' => array ( 0x1E6F, 0x1E6F, 0x1E6F, 0x1E6F ), - 'Tcaron' => array ( 0x0164, 0x0164, 0x0164, 0x0164 ), - 'tcaron' => array ( 0x0165, 0x0165, 0x0165, 0x0165 ), - 'Tcedilla' => array ( 0x0162, 0x0162, 0x0162, 0x0162 ), - 'tcedilla' => array ( 0x0163, 0x0163, 0x0163, 0x0163 ), - 'Tcommaaccent' => array ( 0x0162, 0x0162, 0x0162, 0x0162 ), - 'tcommaaccent' => array ( 0x0163, 0x0163, 0x0163, 0x0163 ), - 'tildecomb' => array ( 0x0303, 0x0303, 
0x0303, 0x0303 ), - 'Ubreve' => array ( 0x016C, 0x016C, 0x016C, 0x016C ), - 'ubreve' => array ( 0x016D, 0x016D, 0x016D, 0x016D ), - 'Ucaron' => array ( 0x01D3, 0x01D3, 0x01D3, 0x01D3 ), - 'uCaron' => array ( 0x01D4, 0x01D4, 0x01D4, 0x01D4 ), - 'Udblacute' => array ( 0x0170, 0x0170, 0x0170, 0x0170 ), - 'udblacute' => array ( 0x0171, 0x0171, 0x0171, 0x0171 ), - 'Udieresis' => array ( 0x00DC, 0x00DC, 0x00DC, 0x00DC ), - 'udieresis' => array ( 0x00FC, 0x00FC, 0x00FC, 0x00FC ), - 'Udotbelow' => array ( 0x1EE4, 0x1EE4, 0x1EE4, 0x1EE4 ), - 'udotbelow' => array ( 0x1EE5, 0x1EE5, 0x1EE5, 0x1EE5 ), - 'Uhookabove' => array ( 0x1EE6, 0x1EE6, 0x1EE6, 0x1EE6 ), - 'uhookabove' => array ( 0x1EE7, 0x1EE7, 0x1EE7, 0x1EE7 ), - 'Uhorn' => array ( 0x01AF, 0x01AF, 0x01AF, 0x01AF ), - 'uhorn' => array ( 0x01B0, 0x01B0, 0x01B0, 0x01B0 ), - 'Uhornacute' => array ( 0x1EE8, 0x1EE8, 0x1EE8, 0x1EE8 ), - 'uhornacute' => array ( 0x1EE9, 0x1EE9, 0x1EE9, 0x1EE9 ), - 'Uhorndotbelow' => array ( 0x1EF0, 0x1EF0, 0x1EF0, 0x1EF0 ), - 'uhorndotbelow' => array ( 0x1EF1, 0x1EF1, 0x1EF1, 0x1EF1 ), - 'Uhorngrave' => array ( 0x1EEA, 0x1EEA, 0x1EEA, 0x1EEA ), - 'uhorngrave' => array ( 0x1EEB, 0x1EEB, 0x1EEB, 0x1EEB ), - 'Uhornhookabove' => array ( 0x1EEC, 0x1EEC, 0x1EEC, 0x1EEC ), - 'uhornhookabove' => array ( 0x1EED, 0x1EED, 0x1EED, 0x1EED ), - 'Uhorntilde' => array ( 0x1EEE, 0x1EEE, 0x1EEE, 0x1EEE ), - 'uhorntilde' => array ( 0x1EEF, 0x1EEF, 0x1EEF, 0x1EEF ), - 'Uhungarumlaut' => array ( 0x0170, 0x0170, 0x0170, 0x0170 ), - 'uhungarumlaut' => array ( 0x0171, 0x0171, 0x0171, 0x0171 ), - 'Umacron' => array ( 0x016A, 0x016A, 0x016A, 0x016A ), - 'umacron' => array ( 0x016B, 0x016B, 0x016B, 0x016B ), - 'Uogonek' => array ( 0x0172, 0x0172, 0x0172, 0x0172 ), - 'uogonek' => array ( 0x0173, 0x0173, 0x0173, 0x0173 ), - 'Uring' => array ( 0x016E, 0x016E, 0x016E, 0x016E ), - 'uring' => array ( 0x016F, 0x016F, 0x016F, 0x016F ), - 'upsilondieresis' => array ( 0x00FF, 0x00FF, 0x00FF, 0x00FF ), - 'Upsilondieresis' => array ( 
0x0178, 0x0178, 0x0178, 0x0178 ), - 'Utilde' => array ( 0x0168, 0x0168, 0x0168, 0x0168 ), - 'utilde' => array ( 0x0169, 0x0169, 0x0169, 0x0169 ), - 'Wacute' => array ( 0x1E82, 0x1E82, 0x1E82, 0x1E82 ), - 'wacute' => array ( 0x1E83, 0x1E83, 0x1E83, 0x1E83 ), - 'Wcircumflex' => array ( 0x0174, 0x0174, 0x0174, 0x0174 ), - 'wcircumflex' => array ( 0x0175, 0x0175, 0x0175, 0x0175 ), - 'Wdieresis' => array ( 0x1E84, 0x1E84, 0x1E84, 0x1E84 ), - 'wdieresis' => array ( 0x1E8E, 0x1E8E, 0x1E8E, 0x1E8E ), - 'Wgrave' => array ( 0x00C0, 0x00C0, 0x00C0, 0x00C0 ), - 'wgrave' => array ( 0x00E0, 0x00E0, 0x00E0, 0x00E0 ), - 'Yacute' => array ( 0x00DD, 0x00DD, 0x00DD, 0x00DD ), - 'yacute' => array ( 0x00DE, 0x00DE, 0x00DE, 0x00DE ), - 'Ycircumflex' => array ( 0x0176, 0x0176, 0x0176, 0x0176 ), - 'ycircumflex' => array ( 0x0177, 0x0177, 0x0177, 0x0177 ), - 'Ydieresis' => array ( 0x0178, 0x0178, 0x0178, 0x0178 ), - 'ydieresis' => array ( 0x00FF, 0x00FF, 0x00FF, 0x00FF ), - 'Ydotbelow' => array ( 0x1EF4, 0x1EF4, 0x1EF4, 0x1EF4 ), - 'ydotbelow' => array ( 0x1EF5, 0x1EF5, 0x1EF5, 0x1EF5 ), - 'Ygrave' => array ( 0x1EF2, 0x1EF2, 0x1EF2, 0x1EF2 ), - 'ygrave' => array ( 0x1EF3, 0x1EF3, 0x1EF3, 0x1EF3 ), - 'Yhookabove' => array ( 0x1EF6, 0x1EF6, 0x1EF6, 0x1EF6 ), - 'yhookabove' => array ( 0x1EF7, 0x1EF7, 0x1EF7, 0x1EF7 ), - 'Ytilde' => array ( 0x1EF8, 0x1EF8, 0x1EF8, 0x1EF8 ), - 'ytilde' => array ( 0x1EF9, 0x1EF9, 0x1EF9, 0x1EF9 ), - 'Zacute' => array ( 0x0179, 0x0179, 0x0179, 0x0179 ), - 'zacute' => array ( 0x017A, 0x017A, 0x017A, 0x017A ), - 'Zcaron' => array ( 0x017E, 0x017E, 0x017E, 0x017E ), - 'zcaron' => array ( 0x017D, 0x017D, 0x017D, 0x017D ), - 'Zcircumflex' => array ( 0x1E90, 0x1E90, 0x1E90, 0x1E90 ), - 'zcircumflex' => array ( 0x1E91, 0x1E91, 0x1E91, 0x1E91 ), - 'zdot' => array ( 0x017C, 0x017C, 0x017C, 0x017C ), - 'Zdot' => array ( 0x017B, 0x017B, 0x017B, 0x017B ), - 'zdotaccent' => array ( 0x017C, 0x017C, 0x017C, 0x017C ), - 'Zdotaccent' => array ( 0x017B, 0x017B, 0x017B, 0x017B ), - 
- // Special symbols - 'approxequal' => array ( 0x2248, 0x2248, 0x2248, 0x2248 ), - 'arrowleft' => array ( 0x2190, 0x2190, 0x2190, 0x2190 ), - 'arrowright' => array ( 0x2192, 0x2192, 0x2192, 0x2192 ), - 'block' => array ( 0x2588, 0x2588, 0x2588, 0x2588 ), - 'circle' => array ( 0x25CB, 0x25CB, 0x25CB, 0x25CB ), - 'club' => array ( 0x2663, 0x2663, 0x2663, 0x2663 ), - 'commaaccent' => array ( 0x002C, 0x002C, 0x002C, 0x002C ), - 'congruent' => array ( 0x2261, 0x2261, 0x2261, 0x2261 ), - 'dkshade' => array ( 0x2593, 0x2593, 0x2593, 0x2593 ), - 'dnblock' => array ( 0x2584, 0x2584, 0x2584, 0x2584 ), - 'eightsuperior' => array ( 0x2078, 0x2663, 0x2663, 0x2663 ), - 'emptyset' => array ( 0x2205, 0x2205, 0x2205, 0x2205 ), - 'equivalence' => array ( 0x2261, 0x2261, 0x2261, 0x2261 ), - 'estimated' => array ( 0x212E, 0x212E, 0x212E, 0x212E ), - 'exclamdbl' => array ( 0x203C, 0x203C, 0x203C, 0x203C ), - 'female' => array ( 0x2640, 0x2640, 0x2640, 0x2640 ), - 'filledbox' => array ( 0x25A0, 0x25A0, 0x25A0, 0x25A0 ), - 'filledrect' => array ( 0x25AC, 0x25AC, 0x25AC, 0x25AC ), - 'fiveeighths' => array ( 0x251D, 0x251D, 0x251D, 0x251D ), - 'fivesuperior' => array ( 0x2075, 0x2075, 0x2075, 0x2075 ), - 'foursuperior' => array ( 0x2074, 0x2074, 0x2074, 0x2074 ), - 'four.superior' => array ( 0x2074, 0x2074, 0x2074, 0x2074 ), - 'franc' => array ( 0x20A3, 0x20A3, 0x20A3, 0x20A3 ), - 'greaterequal' => array ( 0x2265, 0x2265, 0x2265, 0x2265 ), - 'heart' => array ( 0x2665, 0x2665, 0x2665, 0x2665 ), - 'house' => array ( 0x2302, 0x2302, 0x2302, 0x2302 ), - 'increment' => array ( 0x2206, 0x2206, 0x2206, 0x2206 ), - 'infinity' => array ( 0x221E, 0x221E, 0x221E, 0x221E ), - 'integral' => array ( 0x222B, 0x222B, 0x222B, 0x222B ), - 'integralbt' => array ( 0x2321, 0x2321, 0x2321, 0x2321 ), - 'integraltp' => array ( 0x2320, 0x2320, 0x2320, 0x2320 ), - 'intersection' => array ( 0x2229, 0x2229, 0x2229, 0x2229 ), - 'invbullet' => array ( 0x25D8, 0x25D8, 0x25D8, 0x25D8 ), - 'invcircle' => array ( 0x25D9, 
0x25D9, 0x25D9, 0x25D9 ), - 'invsmileface' => array ( 0x263B, 0x263B, 0x263B, 0x263B ), - 'lessequal' => array ( 0x2264, 0x2264, 0x2264, 0x2264 ), - 'lfblock' => array ( 0x258C, 0x258C, 0x258C, 0x258C ), - 'lira' => array ( 0x20A4, 0x20A4, 0x20A4, 0x20A4 ), - 'ltshade' => array ( 0x2591, 0x2591, 0x2591, 0x2591 ), - 'longs' => array ( 0x017F, 0x017F, 0x017F, 0x017F ), - 'male' => array ( 0x2642, 0x2642, 0x2642, 0x2642 ), - 'middot' => array ( 0x00B7, 0x00B7, 0x00B7, 0x00B7 ), - 'minute' => array ( 0x2032, 0x2032, 0x2032, 0x2032 ), - 'musicalnote' => array ( 0x266A, 0x266A, 0x266A, 0x266A ), - 'musicalnotedbl' => array ( 0x266B, 0x266B, 0x266B, 0x266B ), - 'ninesuperior' => array ( 0x2079, 0x2079, 0x2079, 0x2079 ), - 'notequal' => array ( 0x2260, 0x2260, 0x2260, 0x2260 ), - 'nsuperior' => array ( 0x207F, 0x207F, 0x207F, 0x207F ), - 'Ohm' => array ( 0x2126, 0x2126, 0x2126, 0x2126 ), - 'ohm' => array ( 0x03C9, 0x03C9, 0x03C9, 0x03C9 ), - 'oneeighth' => array ( 0x215B, 0x215B, 0x215B, 0x215B ), - 'onesuperior' => array ( 0x2071, 0x2071, 0x2071, 0x2071 ), - 'one.superior' => array ( 0x2071, 0x2071, 0x2071, 0x2071 ), - 'onethird' => array ( 0x2153, 0x2153, 0x2153, 0x2153 ), - 'orthogonal' => array ( 0x221F, 0x221F, 0x221F, 0x221F ), - 'parenleftbt' => array ( 0x0028, 0x0028, 0x0028, 0x0028 ), - 'parenleftex' => array ( 0x0028, 0x0028, 0x0028, 0x0028 ), - 'parenlefttp' => array ( 0x0028, 0x0028, 0x0028, 0x0028 ), - 'parenrightbt' => array ( 0x0029, 0x0029, 0x0029, 0x0029 ), - 'parenrightex' => array ( 0x0029, 0x0029, 0x0029, 0x0029 ), - 'parenrighttp' => array ( 0x0029, 0x0029, 0x0029, 0x0029 ), - 'partialdiff' => array ( 0x2202, 0x2202, 0x2202, 0x2202 ), - 'peseta' => array ( 0x20A7, 0x20A7, 0x20A7, 0x20A7 ), - 'product' => array ( 0x220F, 0x220F, 0x220F, 0x220F ), - 'quotereversed' => array ( 0x201B, 0x201B, 0x201B, 0x201B ), - 'radical' => array ( 0x23B7, 0x23B7, 0x23B7, 0x23B7 ), - 'radicalex' => array ( 0x203E, 0x203E, 0x203E, 0x203E ), - 'revlogicalnot' => array ( 
0x2310, 0x2310, 0x2310, 0x2310 ), - 'rtblock' => array ( 0x2590, 0x2590, 0x2590, 0x2590 ), - 'second' => array ( 0x2033, 0x2033, 0x2033, 0x2033 ), - 'seveneighths' => array ( 0x215E, 0x215E, 0x215E, 0x215E ), - 'sevensuperior' => array ( 0x2077, 0x2077, 0x2077, 0x2077 ), - 'shade' => array ( 0x2592, 0x2592, 0x2592, 0x2592 ), - 'similar' => array ( 0x2242, 0x2242, 0x2242, 0x2242 ), - 'smileface' => array ( 0x263A, 0x263A, 0x263A, 0x263A ), - 'sixsuperior' => array ( 0x2076, 0x2076, 0x2076, 0x2076 ), - 'spade' => array ( 0x2660, 0x2660, 0x2660, 0x2660 ), - 'summation' => array ( 0x2211, 0x2211, 0x2211, 0x2211 ), - 'sun' => array ( 0x263C, 0x263C, 0x263C, 0x263C ), - 'threeeighths' => array ( 0x215C, 0x215C, 0x215C, 0x215C ), - 'threesuperior' => array ( 0x00B3, 0x00B3, 0x00B3, 0x00B3 ), - 'three.superior' => array ( 0x00B3, 0x00B3, 0x00B3, 0x00B3 ), - 'triagdn' => array ( 0x25BC, 0x25BC, 0x25BC, 0x25BC ), - 'triaglf' => array ( 0x25C4, 0x25C4, 0x25C4, 0x25C4 ), - 'triagrt' => array ( 0x25BA, 0x25BA, 0x25BA, 0x25BA ), - 'triagup' => array ( 0x25B2, 0x25B2, 0x25B2, 0x25B2 ), - 'twosuperior' => array ( 0x00B2, 0x00B2, 0x00B2, 0x00B2 ), - 'two.superior' => array ( 0x00B2, 0x00B2, 0x00B2, 0x00B2 ), - 'twothirds' => array ( 0x2154, 0x2154, 0x2154, 0x2154 ), - 'undercommaaccent' => array ( 0x0326, 0x0326, 0x0326, 0x0326 ), - 'underscoredbl' => array ( 0x005F, 0x005F, 0x005F, 0x005F ), - 'upblock' => array ( 0x2580, 0x2580, 0x2580, 0x2580 ), - 'zerosuperior' => array ( 0x2070, 0x2070, 0x2070, 0x2070 ), - - // Greek characters - 'Alpha' => array ( 0x0391, 0x0391, 0x0391, 0x0391 ), - 'alpha' => array ( 0x03B1, 0x03B1, 0x03B1, 0x03B1 ), - 'Alphatonos' => array ( 0x0386, 0x0386, 0x0386, 0x0386 ), - 'alphatonos' => array ( 0x03AC, 0x03AC, 0x03AC, 0x03AC ), - 'anoteleia' => array ( 0x0387, 0x0387, 0x0387, 0x0387 ), - 'Beta' => array ( 0x0392, 0x0392, 0x0392, 0x0392 ), - 'beta' => array ( 0x03B2, 0x03B2, 0x03B2, 0x03B2 ), - 'Gamma' => array ( 0x0393, 0x0393, 0x0393, 0x0393 ), - 
'gamma' => array ( 0x03B3, 0x03B3, 0x03B3, 0x03B3 ), - 'Delta' => array ( 0x0394, 0x0394, 0x0394, 0x0394 ), - 'Deltagreek' => array ( 0x0394, 0x0394, 0x0394, 0x0394 ), - 'delta' => array ( 0x03B4, 0x03B4, 0x03B4, 0x03B4 ), - 'dieresistonos' => array ( 0x0385, 0x0385, 0x0385, 0x0385 ), - 'Epsilon' => array ( 0x0395, 0x0395, 0x0395, 0x0395 ), - 'epsilon' => array ( 0x03B5, 0x03B5, 0x03B5, 0x03B5 ), - 'Epsilontonos' => array ( 0x0388, 0x0388, 0x0388, 0x0388 ), - 'epsilontonos' => array ( 0x03AD, 0x03AD, 0x03AD, 0x03AD ), - 'Etatonos' => array ( 0x0389, 0x0389, 0x0389, 0x0389 ), - 'etatonos' => array ( 0x03AD, 0x03AD, 0x03AD, 0x03AD ), - 'Zeta' => array ( 0x0396, 0x0396, 0x0396, 0x0396 ), - 'zeta' => array ( 0x03B6, 0x03B6, 0x03B6, 0x03B6 ), - 'Eta' => array ( 0x0397, 0x0397, 0x0397, 0x0397 ), - 'eta' => array ( 0x03B7, 0x03B7, 0x03B7, 0x03B7 ), - 'Theta' => array ( 0x0398, 0x0398, 0x0398, 0x0398 ), - 'theta' => array ( 0x03B8, 0x03B8, 0x03B8, 0x03B8 ), - 'Iota' => array ( 0x0399, 0x0399, 0x0399, 0x0399 ), - 'Iotadieresis' => array ( 0x03AA, 0x03AA, 0x03AA, 0x03AA ), - 'iotadieresis' => array ( 0x03CA, 0x03CA, 0x03CA, 0x03CA ), - 'iota' => array ( 0x03B9, 0x03B9, 0x03B9, 0x03B9 ), - 'iotadieresistonos' => array ( 0x0390, 0x0390, 0x0390, 0x0390 ), - 'Iotatonos' => array ( 0x038A, 0x038A, 0x038A, 0x038A ), - 'iotatonos' => array ( 0x03AF, 0x03AF, 0x03AF, 0x03AF ), - 'Kappa' => array ( 0x039A, 0x039A, 0x039A, 0x039A ), - 'kappa' => array ( 0x03BA, 0x03BA, 0x03BA, 0x03BA ), - 'Lambda' => array ( 0x039B, 0x039B, 0x039B, 0x039B ), - 'lambda' => array ( 0x03BB, 0x03BB, 0x03BB, 0x03BB ), - 'Mu' => array ( 0x039C, 0x039C, 0x039C, 0x039C ), - 'mu' => array ( 0x03BC, 0x03BC, 0x03BC, 0x03BC ), - 'Mu1' => array ( 0x039C, 0x039C, 0x039C, 0x039C ), - 'mu1' => array ( 0x03BC, 0x03BC, 0x03BC, 0x03BC ), - 'Nu' => array ( 0x039D, 0x039D, 0x039D, 0x039D ), - 'nu' => array ( 0x03BD, 0x03BD, 0x03BD, 0x03BD ), - 'Xi' => array ( 0x039E, 0x039E, 0x039E, 0x039E ), - 'xi' => array ( 0x03BE, 
0x03BE, 0x03BE, 0x03BE ), - 'Omicron' => array ( 0x039F, 0x039F, 0x039F, 0x039F ), - 'omicron' => array ( 0x03BF, 0x03BF, 0x03BF, 0x03BF ), - 'Omicrontonos' => array ( 0x038C, 0x038C, 0x038C, 0x038C ), - 'omicrontonos' => array ( 0x03CC, 0x03CC, 0x03CC, 0x03CC ), - 'Pi' => array ( 0x03A0, 0x03A0, 0x03A0, 0x03A0 ), - 'pi' => array ( 0x03C0, 0x03C0, 0x03C0, 0x03C0 ), - 'Rho' => array ( 0x03A1, 0x03A1, 0x03A1, 0x03A1 ), - 'rho' => array ( 0x03C1, 0x03C1, 0x03C1, 0x03C1 ), - 'Sigma' => array ( 0x03A3, 0x03A3, 0x03A3, 0x03A3 ), - 'sigma' => array ( 0x03C3, 0x03C3, 0x03C3, 0x03C3 ), - 'Sigma1' => array ( 0x03A2, 0x03A2, 0x03A2, 0x03A2 ), - 'sigma1' => array ( 0x03C2, 0x03C2, 0x03C2, 0x03C2 ), - 'Tau' => array ( 0x03A4, 0x03A4, 0x03A4, 0x03A4 ), - 'tonos' => array ( 0x0384, 0x0384, 0x0384, 0x0384 ), - 'tau' => array ( 0x03C4, 0x03C4, 0x03C4, 0x03C4 ), - 'Upsilon' => array ( 0x03A5, 0x03A5, 0x03A5, 0x03A5 ), - 'upsilon' => array ( 0x03C5, 0x03C5, 0x03C5, 0x03C5 ), - 'Upsilondieresis' => array ( 0x03AB, 0x03AB, 0x03AB, 0x03AB ), - 'upsilondieresis' => array ( 0x03CB, 0x03CB, 0x03CB, 0x03CB ), - 'Upsilontonos' => array ( 0x038E, 0x038E, 0x038E, 0x038E ), - 'upsilontonos' => array ( 0x03CD, 0x03CD, 0x03CD, 0x03CD ), - 'upsilondieresistonos' => array ( 0x03B0, 0x03B0, 0x03B0, 0x03B0 ), - 'Phi' => array ( 0x03A6, 0x03A6, 0x03A6, 0x03A6 ), - 'phi' => array ( 0x03C6, 0x03C6, 0x03C6, 0x03C6 ), - 'Chi' => array ( 0x03A7, 0x03A7, 0x03A7, 0x03A7 ), - 'chi' => array ( 0x03C7, 0x03C7, 0x03C7, 0x03C7 ), - 'Psi' => array ( 0x03A8, 0x03A8, 0x03A8, 0x03A8 ), - 'psi' => array ( 0x03C8, 0x03C8, 0x03C8, 0x03C8 ), - 'Omega' => array ( 0x03A9, 0x03A9, 0x03A9, 0x03A9 ), - 'omega' => array ( 0x03C9, 0x03C9, 0x03C9, 0x03C9 ), - 'Omegatonos' => array ( 0x038F, 0x038F, 0x038F, 0x038F ), - 'omegatonos' => array ( 0x03CE, 0x03CE, 0x03CE, 0x03CE ), - - // http://www.tipometar.org/pojmovnik/Hint/img/Using%20Fontographer.pdf - // 
ftp://ftp.software.ibm.com/software/globalization/gcoc/attachments/CP00437.pdf - // http://jrgraphix.net/r/Unicode/2500-257F - // http://www.alanwood.net/demos/wingdings.html - // Almost everything is in the links ; the table blow needs to be completed, though - '.notdef' => array ( 0x0020, 0x0020, 0x0020, 0x0020 ), // Undefined (?) - 'afii00208' => array ( 0x002D, 0x002D, 0x002D, 0x002D ), // Minus - 'afii08941' => array ( 0x204A, 0x204A, 0x204A, 0x204A ), // Pound - 'afii10017' => array ( 0x0410, 0x0410, 0x0410, 0x0410 ), - 'afii10018' => array ( 0x0411, 0x0411, 0x0411, 0x0411 ), - 'afii10019' => array ( 0x0412, 0x0412, 0x0412, 0x0412 ), - 'afii10020' => array ( 0x0413, 0x0413, 0x0413, 0x0413 ), - 'afii10021' => array ( 0x0414, 0x0414, 0x0414, 0x0414 ), - 'afii10022' => array ( 0x0415, 0x0415, 0x0415, 0x0415 ), - 'afii10023' => array ( 0x0401, 0x0401, 0x0401, 0x0401 ), - 'afii10024' => array ( 0x0416, 0x0416, 0x0416, 0x0416 ), - 'afii10025' => array ( 0x0417, 0x0417, 0x0417, 0x0417 ), - 'afii10026' => array ( 0x0418, 0x0418, 0x0418, 0x0418 ), - 'afii10027' => array ( 0x0419, 0x0419, 0x0419, 0x0419 ), - 'afii10028' => array ( 0x041a, 0x041a, 0x041a, 0x041a ), - 'afii10029' => array ( 0x041b, 0x041b, 0x041b, 0x041b ), - 'afii10030' => array ( 0x041c, 0x041c, 0x041c, 0x041c ), - 'afii10031' => array ( 0x041d, 0x041d, 0x041d, 0x041d ), - 'afii10032' => array ( 0x041e, 0x041e, 0x041e, 0x041e ), - 'afii10033' => array ( 0x041f, 0x041f, 0x041f, 0x041f ), - 'afii10034' => array ( 0x0420, 0x0420, 0x0420, 0x0420 ), - 'afii10035' => array ( 0x0421, 0x0421, 0x0421, 0x0421 ), - 'afii10036' => array ( 0x0422, 0x0422, 0x0422, 0x0422 ), - 'afii10037' => array ( 0x0423, 0x0423, 0x0423, 0x0423 ), - 'afii10038' => array ( 0x0424, 0x0424, 0x0424, 0x0424 ), - 'afii10039' => array ( 0x0425, 0x0425, 0x0425, 0x0425 ), - 'afii10040' => array ( 0x0426, 0x0426, 0x0426, 0x0426 ), - 'afii10041' => array ( 0x0427, 0x0427, 0x0427, 0x0427 ), - 'afii10042' => array ( 0x0428, 0x0428, 0x0428, 
0x0428 ), - 'afii10043' => array ( 0x0429, 0x0429, 0x0429, 0x0429 ), - 'afii10044' => array ( 0x042a, 0x042a, 0x042a, 0x042a ), - 'afii10045' => array ( 0x042b, 0x042b, 0x042b, 0x042b ), - 'afii10046' => array ( 0x042c, 0x042c, 0x042c, 0x042c ), - 'afii10047' => array ( 0x042d, 0x042d, 0x042d, 0x042d ), - 'afii10048' => array ( 0x042e, 0x042e, 0x042e, 0x042e ), - 'afii10049' => array ( 0x042f, 0x042f, 0x042f, 0x042f ), - 'afii10050' => array ( 0x0490, 0x0490, 0x0490, 0x0490 ), - 'afii10051' => array ( 0x0402, 0x0402, 0x0402, 0x0402 ), - 'afii10052' => array ( 0x0403, 0x0403, 0x0403, 0x0403 ), - 'afii10053' => array ( 0x0404, 0x0404, 0x0404, 0x0404 ), - 'afii10054' => array ( 0x0405, 0x0405, 0x0405, 0x0405 ), - 'afii10055' => array ( 0x0406, 0x0406, 0x0406, 0x0406 ), - 'afii10056' => array ( 0x0407, 0x0407, 0x0407, 0x0407 ), - 'afii10057' => array ( 0x0408, 0x0408, 0x0408, 0x0408 ), - 'afii10058' => array ( 0x0409, 0x0409, 0x0409, 0x0409 ), - 'afii10059' => array ( 0x040a, 0x040a, 0x040a, 0x040a ), - 'afii10060' => array ( 0x040b, 0x040b, 0x040b, 0x040b ), - 'afii10061' => array ( 0x040c, 0x040c, 0x040c, 0x040c ), - 'afii10062' => array ( 0x040e, 0x040e, 0x040e, 0x040e ), - 'afii10065' => array ( 0x0430, 0x0430, 0x0430, 0x0430 ), - 'afii10066' => array ( 0x0431, 0x0431, 0x0431, 0x0431 ), - 'afii10067' => array ( 0x0432, 0x0432, 0x0432, 0x0432 ), - 'afii10068' => array ( 0x0433, 0x0433, 0x0433, 0x0433 ), - 'afii10069' => array ( 0x0434, 0x0434, 0x0434, 0x0434 ), - 'afii10070' => array ( 0x0435, 0x0435, 0x0435, 0x0435 ), - 'afii10071' => array ( 0x0436, 0x0436, 0x0436, 0x0436 ), - 'afii10072' => array ( 0x0437, 0x0437, 0x0437, 0x0437 ), - 'afii10073' => array ( 0x0438, 0x0438, 0x0438, 0x0438 ), - 'afii10074' => array ( 0x0439, 0x0439, 0x0439, 0x0439 ), - 'afii10075' => array ( 0x043a, 0x043a, 0x043a, 0x043a ), - 'afii10076' => array ( 0x043b, 0x043b, 0x043b, 0x043b ), - 'afii10077' => array ( 0x043c, 0x043c, 0x043c, 0x043c ), - 'afii10078' => array ( 0x043d, 0x043d, 
0x043d, 0x043d ), - 'afii10079' => array ( 0x043e, 0x043e, 0x043e, 0x043e ), - 'afii10080' => array ( 0x043f, 0x043f, 0x043f, 0x043f ), - 'afii10081' => array ( 0x0440, 0x0440, 0x0440, 0x0440 ), - 'afii10082' => array ( 0x0441, 0x0441, 0x0441, 0x0441 ), - 'afii10083' => array ( 0x0442, 0x0442, 0x0442, 0x0442 ), - 'afii10084' => array ( 0x0443, 0x0443, 0x0443, 0x0443 ), - 'afii10085' => array ( 0x0444, 0x0444, 0x0444, 0x0444 ), - 'afii10086' => array ( 0x0445, 0x0445, 0x0445, 0x0445 ), - 'afii10087' => array ( 0x0446, 0x0446, 0x0446, 0x0446 ), - 'afii10088' => array ( 0x0447, 0x0447, 0x0447, 0x0447 ), - 'afii10089' => array ( 0x0448, 0x0448, 0x0448, 0x0448 ), - 'afii10090' => array ( 0x0449, 0x0449, 0x0449, 0x0449 ), - 'afii10091' => array ( 0x044a, 0x044a, 0x044a, 0x044a ), - 'afii10092' => array ( 0x044b, 0x044b, 0x044b, 0x044b ), - 'afii10093' => array ( 0x044c, 0x044c, 0x044c, 0x044c ), - 'afii10094' => array ( 0x044d, 0x044d, 0x044d, 0x044d ), - 'afii10095' => array ( 0x044e, 0x044e, 0x044e, 0x044e ), - 'afii10096' => array ( 0x044f, 0x044f, 0x044f, 0x044f ), - 'afii10097' => array ( 0x0450, 0x0450, 0x0450, 0x0450 ), - 'afii10098' => array ( 0x0451, 0x0451, 0x0451, 0x0451 ), - 'afii10099' => array ( 0x0452, 0x0452, 0x0452, 0x0452 ), - 'afii10100' => array ( 0x0453, 0x0453, 0x0453, 0x0453 ), - 'afii10101' => array ( 0x0454, 0x0454, 0x0454, 0x0454 ), - 'afii10102' => array ( 0x0455, 0x0455, 0x0455, 0x0455 ), - 'afii10103' => array ( 0x0456, 0x0456, 0x0456, 0x0456 ), - 'afii10104' => array ( 0x0457, 0x0457, 0x0457, 0x0457 ), - 'afii10105' => array ( 0x0458, 0x0458, 0x0458, 0x0458 ), - 'afii10106' => array ( 0x0459, 0x0459, 0x0459, 0x0459 ), - 'afii10107' => array ( 0x045a, 0x045a, 0x045a, 0x045a ), - 'afii10108' => array ( 0x045b, 0x045b, 0x045b, 0x045b ), - 'afii10109' => array ( 0x045c, 0x045c, 0x045c, 0x045c ), - 'afii10110' => array ( 0x045E, 0x045E, 0x045E, 0x045E ), - 'afii10145' => array ( 0x040F, 0x040F, 0x040F, 0x040F ), - 'afii10193' => array ( 0x045F, 
0x045F, 0x045F, 0x045F ), - 'afii61248' => array ( 0x2105, 0x2105, 0x2105, 0x2105 ), // English symbol "care of" - 'afii61289' => array ( 0x2113, 0x2113, 0x2113, 0x2113 ), // Lower "l de ronde" - 'afii61352' => array ( 0x2116, 0x2116, 0x2116, 0x2116 ), - 'H18543' => array ( 0x25A0, 0x25A0, 0x25A0, 0x25A0 ), // Black square - 'H18533' => array ( 0x25CF, 0x25CF, 0x25CF, 0x25CF ), // Black circle - 'H22073' => array ( 0x25A1, 0x25A1, 0x25A1, 0x25A1 ), // White square - 'H18551' => array ( 0x25AB, 0x25AB, 0x25AB, 0x25AB ), // White square with double horizontal borders - 'SF070000' => array ( 0x2534, 0x2534, 0x2534, 0x2534 ), // Semi-graphic - 'SF010000' => array ( 0x250C, 0x250C, 0x250C, 0x250C ), - 'SF020000' => array ( 0x2514, 0x2514, 0x2514, 0x2514 ), - 'SF030000' => array ( 0x2510, 0x2510, 0x2510, 0x2510 ), - 'SF040000' => array ( 0x2518, 0x2518, 0x2518, 0x2518 ), - 'SF050000' => array ( 0x253C, 0x253C, 0x253C, 0x253C ), - 'SF060000' => array ( 0x252C, 0x252C, 0x252C, 0x252C ), - 'SF070000' => array ( 0x2534, 0x2534, 0x2534, 0x2534 ), - 'SF080000' => array ( 0x251C, 0x251C, 0x251C, 0x251C ), - 'SF090000' => array ( 0x2524, 0x2524, 0x2524, 0x2524 ), - 'SF100000' => array ( 0x2501, 0x2501, 0x2501, 0x2501 ), - 'SF110000' => array ( 0x2502, 0x2502, 0x2502, 0x2502 ), - 'SF190000' => array ( 0x2561, 0x2561, 0x2561, 0x2561 ), - 'SF200000' => array ( 0x2562, 0x2562, 0x2562, 0x2562 ), - 'SF210000' => array ( 0x2556, 0x2556, 0x2556, 0x2556 ), - 'SF220000' => array ( 0x2555, 0x2555, 0x2555, 0x2555 ), - 'SF230000' => array ( 0x2563, 0x2563, 0x2563, 0x2563 ), - 'SF240000' => array ( 0x2551, 0x2551, 0x2551, 0x2551 ), - 'SF250000' => array ( 0x2557, 0x2557, 0x2557, 0x2557 ), - 'SF260000' => array ( 0x255D, 0x255D, 0x255D, 0x255D ), - 'SF270000' => array ( 0x255C, 0x255C, 0x255C, 0x255C ), - 'SF280000' => array ( 0x255B, 0x255B, 0x255B, 0x255B ), - 'SF360000' => array ( 0x255E, 0x255E, 0x255E, 0x255E ), - 'SF370000' => array ( 0x255F, 0x255F, 0x255F, 0x255F ), - 'SF380000' => 
array ( 0x255F, 0x255F, 0x255F, 0x255F ), - 'SF390000' => array ( 0x2554, 0x2554, 0x2554, 0x2554 ), - 'SF400000' => array ( 0x2569, 0x2569, 0x2569, 0x2569 ), - 'SF410000' => array ( 0x2566, 0x2566, 0x2566, 0x2566 ), - 'SF420000' => array ( 0x2560, 0x2560, 0x2560, 0x2560 ), - 'SF430000' => array ( 0x2550, 0x2550, 0x2550, 0x2550 ), - 'SF440000' => array ( 0x256C, 0x256C, 0x256C, 0x256C ), - 'SF450000' => array ( 0x2567, 0x2567, 0x2567, 0x2567 ), - 'SF460000' => array ( 0x2568, 0x2568, 0x2568, 0x2568 ), - 'SF470000' => array ( 0x2564, 0x2564, 0x2564, 0x2564 ), - 'SF480000' => array ( 0x2565, 0x2565, 0x2565, 0x2565 ), - 'SF490000' => array ( 0x2559, 0x2559, 0x2559, 0x2559 ), - 'SF500000' => array ( 0x2558, 0x2558, 0x2558, 0x2558 ), - 'SF510000' => array ( 0x2552, 0x2552, 0x2552, 0x2552 ), - 'SF520000' => array ( 0x2553, 0x2553, 0x2553, 0x2553 ), - 'SF530000' => array ( 0x256B, 0x256B, 0x256B, 0x256B ), - 'SF540000' => array ( 0x256A, 0x256A, 0x256A, 0x256A ), - - // Wingdings - 'arrowboth' => array ( 0x2194, 0x2194, 0x2194, 0x2194 ), - 'arrowdown' => array ( 0x2193, 0x2193, 0x2193, 0x2193 ), - 'arrowleft' => array ( 0x2190, 0x2190, 0x2190, 0x2190 ), - 'arrowright' => array ( 0x2192, 0x2192, 0x2192, 0x2192 ), - 'arrowup' => array ( 0x2191, 0x2191, 0x2191, 0x2191 ), - 'arrowupdn' => array ( 0x2195, 0x2195, 0x2195, 0x2195 ), - 'arrowupdnbse' => array ( 0x21A8, 0x21A8, 0x21A8, 0x21A8 ), - 'barb2left' => array ( 0x1F868, 0x1F868, 0x1F868, 0x1F868 ), // Wide-headed leftwards barb arrow - 'barb2right' => array ( 0x1F86A, 0x1F86A, 0x1F86A, 0x1F86A ), // Wide-headed rightwards barb arrow - 'barb2up' => array ( 0x1F869, 0x1F869, 0x1F869, 0x1F869 ), // Wide-headed upwards barb arrow - 'barb2down' => array ( 0x1F86B, 0x1F86B, 0x1F86B, 0x1F86B ), // Wide-headed downwards barb arrow - 'barb2nw' => array ( 0x1F86C, 0x1F86C, 0x1F86C, 0x1F86C ), // Wide-headed north west barb arrow - 'barb2ne' => array ( 0x1F86D, 0x1F86D, 0x1F86D, 0x1F86D ), // Wide-headed north east barb arrow - 
'barb2sw' => array ( 0x1F86F, 0x1F86F, 0x1F86F, 0x1F86F ), // Wide-headed south west barb arrow - 'barb2se' => array ( 0x1F86E, 0x1F86E, 0x1F86E, 0x1F86E ), // Wide-headed south east barb arrow - 'barb4left' => array ( 0x1F878, 0x1F878, 0x1F878, 0x1F878 ), // Wide-headed leftwards barb arrow - 'barb4right' => array ( 0x1F87A, 0x1F87A, 0x1F87A, 0x1F87A ), // Wide-headed rightwards barb arrow - 'barb4up' => array ( 0x1F879, 0x1F879, 0x1F879, 0x1F879 ), // Wide-headed upwards barb arrow - 'barb4down' => array ( 0x1F87B, 0x1F87B, 0x1F87B, 0x1F87B ), // Wide-headed downwards barb arrow - 'barb4nw' => array ( 0x1F87C, 0x1F87C, 0x1F87C, 0x1F87C ), // Wide-headed north west barb arrow - 'barb4ne' => array ( 0x1F87D, 0x1F87D, 0x1F87D, 0x1F87D ), // Wide-headed north east barb arrow - 'barb4sw' => array ( 0x1F87F, 0x1F87F, 0x1F87F, 0x1F87F ), // Wide-headed south west barb arrow - 'barb4se' => array ( 0x1F87E, 0x1F87E, 0x1F87E, 0x1F87E ), // Wide-headed south east barb arrow - 'checkbld' => array ( 0x2714, 0x2714, 0x2714, 0x2714 ), // Heavy checkmark - 'diamond' => array ( 0x2666, 0x2666, 0x2666, 0x2666 ), - 'head2left' => array ( 0x2B98, 0x2B98, 0x2B98, 0x2B98 ), - 'head2right' => array ( 0x2B9A, 0x2B9A, 0x2B9A, 0x2B9A ), - 'head2up' => array ( 0x2B99, 0x2B99, 0x2B99, 0x2B99 ), - 'head2down' => array ( 0x2B9B, 0x2B9B, 0x2B9B, 0x2B9B ), - 'lozenge' => array ( 0x2B27, 0x2B27, 0x2B27, 0x2B27 ), - 'lozenge4' => array ( 0x2B27, 0x2B27, 0x2B27, 0x2B27 ), - 'lozenge6' => array ( 0x29EB, 0x29EB, 0x29EB, 0x29EB ), - 'openbullet' => array ( 0x25E6, 0x25E6, 0x25E6, 0x25E6 ), - 'square2' => array ( 0x25AA, 0x25AA, 0x25AA, 0x25AA ), - 'square4' => array ( 0x25AA, 0x25AA, 0x25AA, 0x25AA ), - 'square6' => array ( 0x25A0, 0x25A0, 0x25A0, 0x25A0 ), - 'xrhombus' => array ( 0x2756, 0x2756, 0x2756, 0x2756 ), - - // "Entities" found in some documents, but their name made it difficult to locate the entity reference - // within the PDF file ; their names are not meaningful enough to extrapolate 
their Unicode equivalent : - // .null - // [aAoO].superior - // allah - // apple - // arrowhorizex - // bari.dotless - // circumflex.arab - // cyrillic_otmark - // dot.one, dot.twohoriz, dot.threeup, dot.twovert, dot.four - // f02d - // Gxx, which do not seem to function as /gxx - // glyphxxx - // Ldot and ldot (didn't found the Unicode name) - // lillah - // noxxx, where 'xxx' is a Greek letter name - // nonmarkingreturn - // patah.wide - // pi1 - // ryial - // smallv - // UIforward - // vdaggerdbl - // wasla - // wavyhamza - // zero.slash - ) ; + array ( 0101, 0101, 0101, 0101 ), + 'AE' => array ( 0341, 0256, 0306, 0306 ), + 'Aacute' => array ( 0, 0347, 0301, 0301 ), + 'Acircumflex' => array ( 0, 0345, 0302, 0302 ), + 'Adieresis' => array ( 0, 0200, 0304, 0304 ), + 'Agrave' => array ( 0, 0313, 0300, 0300 ), + 'Aring' => array ( 0, 0201, 0305, 0305 ), + 'Atilde' => array ( 0, 0314, 0303, 0303 ), + 'B' => array ( 0102, 0102, 0102, 0102 ), + 'C' => array ( 0103, 0103, 0103, 0103 ), + 'Ccedilla' => array ( 0, 0202, 0307, 0307 ), + 'D' => array ( 0104, 0104, 0104, 0104 ), + 'E' => array ( 0105, 0105, 0105, 0105 ), + 'Eacute' => array ( 0, 0203, 0311, 0311 ), + 'Ecircumflex' => array ( 0, 0346, 0312, 0312 ), + 'Edieresis' => array ( 0, 0350, 0313, 0313 ), + 'Egrave' => array ( 0, 0351, 0310, 0310 ), + 'Eth' => array ( 0, 0, 0320, 0320 ), + 'Euro' => array ( 0, 0, 0200, 0240 ), + 'F' => array ( 0106, 0106, 0106, 0106 ), + 'G' => array ( 0107, 0107, 0107, 0107 ), + 'H' => array ( 0110, 0110, 0110, 0110 ), + 'I' => array ( 0111, 0111, 0111, 0111 ), + 'Iacute' => array ( 0, 0352, 0315, 0315 ), + 'Icircumflex' => array ( 0, 0353, 0316, 0316 ), + 'Idieresis' => array ( 0, 0354, 0317, 0317 ), + 'Igrave' => array ( 0, 0355, 0314, 0314 ), + 'J' => array ( 0112, 0112, 0112, 0112 ), + 'K' => array ( 0113, 0113, 0113, 0113 ), + 'L' => array ( 0114, 0114, 0114, 0114 ), + 'Lslash' => array ( 0x0141, 0x0141, 0x0141, 0x0141 ), + 'M' => array ( 0115, 0115, 0115, 0115 ), + 'N' => array 
( 0116, 0116, 0116, 0116 ), + 'Ntilde' => array ( 0, 0204, 0321, 0321 ), + 'O' => array ( 0117, 0117, 0117, 0117 ), + 'OE' => array ( 0352, 0316, 0214, 0226 ), + 'Oacute' => array ( 0, 0356, 0323, 0323 ), + 'Ocircumflex' => array ( 0, 0357, 0324, 0324 ), + 'Odieresis' => array ( 0, 0205, 0326, 0326 ), + 'Ograve' => array ( 0, 0361, 0322, 0322 ), + 'Oslash' => array ( 0351, 0257, 0330, 0330 ), + 'Otilde' => array ( 0, 0315, 0325, 0325 ), + 'P' => array ( 0120, 0120, 0120, 0120 ), + 'Q' => array ( 0121, 0121, 0121, 0121 ), + 'R' => array ( 0122, 0122, 0122, 0122 ), + 'S' => array ( 0123, 0123, 0123, 0123 ), + 'Scaron' => array ( 0, 0, 0212, 0227 ), + 'T' => array ( 0124, 0124, 0124, 0124 ), + 'Thorn' => array ( 0, 0, 0336, 0336 ), + 'U' => array ( 0125, 0125, 0125, 0125 ), + 'Uacute' => array ( 0, 0362, 0332, 0332 ), + 'Ucircumflex' => array ( 0, 0363, 0333, 0333 ), + 'Udieresis' => array ( 0, 0206, 0334, 0334 ), + 'Ugrave' => array ( 0, 0364, 0331, 0331 ), + 'V' => array ( 0126, 0126, 0126, 0126 ), + 'W' => array ( 0127, 0127, 0127, 0127 ), + 'X' => array ( 0130, 0130, 0130, 0130 ), + 'Y' => array ( 0131, 0131, 0131, 0131 ), + 'Yacute' => array ( 0, 0, 0335, 0335 ), + 'Ydieresis' => array ( 0, 0331, 0237, 0230 ), + 'Z' => array ( 0132, 0132, 0132, 0132 ), + 'Zcaron' => array ( 0, 0, 0216, 0231 ), + 'a' => array ( 0141, 0141, 0141, 0141 ), + 'aacute' => array ( 0, 0207, 0341, 0341 ), + 'acircumflex' => array ( 0, 0211, 0342, 0342 ), + 'acute' => array ( 0302, 0253, 0264, 0264 ), + 'adieresis' => array ( 0, 0212, 0344, 0344 ), + 'ae' => array ( 0361, 0276, 0346, 0346 ), + 'agrave' => array ( 0, 0210, 0340, 0340 ), + 'ampersand' => array ( 0046, 0046, 0046, 0046 ), + 'aring' => array ( 0, 0214, 0345, 0345 ), + 'asciicircum' => array ( 0136, 0136, 0136, 0136 ), + 'asciitilde' => array ( 0176, 0176, 0176, 0176 ), + 'asterisk' => array ( 0052, 0052, 0052, 0052 ), + 'at' => array ( 0100, 0100, 0100, 0100 ), + 'atilde' => array ( 0, 0213, 0343, 0343 ), + 'b' => array ( 
0142, 0142, 0142, 0142 ), + 'backslash' => array ( 0134, 0134, 0134, 0134 ), + 'bar' => array ( 0174, 0174, 0174, 0174 ), + 'braceleft' => array ( 0173, 0173, 0173, 0173 ), + 'braceright' => array ( 0175, 0175, 0175, 0175 ), + 'bracketleft' => array ( 0133, 0133, 0133, 0133 ), + 'bracketright' => array ( 0135, 0135, 0135, 0135 ), + 'breve' => array ( 0306, 0371, 0, 0030 ), + 'brokenbar' => array ( 0, 0, 0246, 0246 ), + 'bullet' => array ( 0267, 0245, 0225, 0200 ), + 'c' => array ( 0143, 0143, 0143, 0143 ), + 'caron' => array ( 0317, 0377, 0, 0031 ), + 'ccedilla' => array ( 0, 0215, 0347, 0347 ), + 'cedilla' => array ( 0313, 0374, 0270, 0270 ), + 'cent' => array ( 0242, 0242, 0242, 0242 ), + 'circumflex' => array ( 0303, 0366, 0210, 0032 ), + 'colon' => array ( 0072, 0072, 0072, 0072 ), + 'comma' => array ( 0054, 0054, 0054, 0054 ), + 'copyright' => array ( 0, 0251, 0251, 0251 ), + 'currency' => array ( 0250, 0333, 0244, 0244 ), + 'd' => array ( 0144, 0144, 0144, 0144 ), + 'dagger' => array ( 0262, 0240, 0206, 0201 ), + 'daggerdbl' => array ( 0263, 0340, 0207, 0202 ), + 'degree' => array ( 0, 0241, 0260, 0260 ), + 'dieresis' => array ( 0310, 0254, 0250, 0250 ), + 'divide' => array ( 0, 0326, 0367, 0367 ), + 'dollar' => array ( 0044, 0044, 0044, 0044 ), + 'dotaccent' => array ( 0307, 0372, 0, 0033 ), + 'dotlessi' => array ( 0365, 0365, 0x131, 0232 ), + 'e' => array ( 0145, 0145, 0145, 0145 ), + 'eacute' => array ( 0, 0216, 0351, 0351 ), + 'ecircumflex' => array ( 0, 0220, 0352, 0352 ), + 'edieresis' => array ( 0, 0221, 0353, 0353 ), + 'egrave' => array ( 0, 0217, 0350, 0350 ), + 'eight' => array ( 0070, 0070, 0070, 0070 ), + 'elipsis' => array ( 0x2026, 0x2026, 0x2026, 0x2026 ), + 'ellipsis' => array ( 0x2026, 0x2026, 0x2026, 0x2026 ), + 'emdash' => array ( 0x2D, 0x2D, 0x2D, 0x2D ), + 'endash' => array ( 0x2D, 0x2D, 0x2D, 0x2D ), + 'equal' => array ( 0075, 0075, 0075, 0075 ), + 'eth' => array ( 0, 0, 0360, 0360 ), + 'exclam' => array ( 0041, 0041, 0041, 0041 ), + 
'exclamdown' => array ( 0241, 0301, 0241, 0241 ), + 'f' => array ( 0146, 0146, 0146, 0146 ), + 'fi' => array ( 0xFB01, 0xFB01, 0xFB01, 0xFB01 ), + 'five' => array ( 0065, 0065, 0065, 0065 ), + 'ff' => array ( 0xFB00, 0xFB00, 0xFB00, 0xFB00 ), + 'fl' => array ( 0xFB02, 0xFB02, 0xFB02, 0xFB02 ), + 'ffi' => array ( 0xFB03, 0xFB03, 0xFB03, 0xFB03 ), + 'ffl' => array ( 0xFB04, 0xFB04, 0xFB04, 0xFB04 ), + 'florin' => array ( 0246, 0304, 0203, 0206 ), + 'four' => array ( 0064, 0064, 0064, 0064 ), + 'fraction' => array ( 0244, 0332, 0, 0207 ), + 'g' => array ( 0147, 0147, 0147, 0147 ), + 'germandbls' => array ( 0373, 0247, 0337, 0337 ), + 'grave' => array ( 0301, 0140, 0140, 0140 ), + 'greater' => array ( 0076, 0076, 0076, 0076 ), + 'guillemotleft' => array ( 0253, 0307, 0253, 0253 ), + 'guillemotright' => array ( 0273, 0310, 0273, 0273 ), + 'guilsinglleft' => array ( 0254, 0334, 0213, 0210 ), + 'guilsinglright' => array ( 0255, 0335, 0233, 0211 ), + 'h' => array ( 0150, 0150, 0150, 0150 ), + 'hungarumlaut' => array ( 0315, 0375, 0, 0034 ), + 'hyphen' => array ( 0x2D, 0x2D, 0x2D, 0x2D ), + 'i' => array ( 0151, 0151, 0151, 0151 ), + 'iacute' => array ( 0, 0222, 0355, 0355 ), + 'icircumflex' => array ( 0, 0224, 0356, 0356 ), + 'idieresis' => array ( 0, 0225, 0357, 0357 ), + 'igrave' => array ( 0, 0223, 0354, 0354 ), + 'j' => array ( 0152, 0152, 0152, 0152 ), + 'k' => array ( 0153, 0153, 0153, 0153 ), + 'l' => array ( 0154, 0154, 0154, 0154 ), + 'less' => array ( 0074, 0074, 0074, 0074 ), + 'logicalnot' => array ( 0, 0302, 0254, 0254 ), + 'lslash' => array ( 0x0142, 0x0142, 0x0142, 0x0142 ), + 'm' => array ( 0155, 0155, 0155, 0155 ), + 'macron' => array ( 0305, 0370, 0257, 0257 ), + 'minus' => array ( 0x2D, 0x2D, 0x2D, 0x2D ), + 'mu' => array ( 0, 0265, 0265, 0265 ), + 'multiply' => array ( 0, 0, 0327, 0327 ), + 'n' => array ( 0156, 0156, 0156, 0156 ), + 'nine' => array ( 0071, 0071, 0071, 0071 ), + 'ntilde' => array ( 0, 0226, 0361, 0361 ), + 'numbersign' => array ( 0043, 
0043, 0043, 0043 ), + 'o' => array ( 0157, 0157, 0157, 0157 ), + 'oacute' => array ( 0, 0227, 0363, 0363 ), + 'ocircumflex' => array ( 0, 0231, 0364, 0364 ), + 'odieresis' => array ( 0, 0232, 0366, 0366 ), + 'oe' => array ( 0372, 0317, 0234, 0234 ), + 'ogonek' => array ( 0x2DB, 0x2DB, 0x2DB, 0x2DB ), + 'ograve' => array ( 0, 0230, 0362, 0362 ), + 'one' => array ( 0061, 0061, 0061, 0061 ), + 'onehalf' => array ( 0, 0, 0275, 0275 ), + 'onequarter' => array ( 0, 0, 0274, 0274 ), + 'ordfeminine' => array ( 0343, 0273, 0252, 0252 ), + 'ordmasculine' => array ( 0353, 0274, 0272, 0272 ), + 'oslash' => array ( 0371, 0277, 0370, 0370 ), + 'otilde' => array ( 0, 0233, 0365, 0365 ), + 'p' => array ( 0160, 0160, 0160, 0160 ), + 'paragraph' => array ( 0266, 0246, 0266, 0266 ), + 'parenleft' => array ( 0050, 0050, 0050, 0050 ), + 'parenright' => array ( 0051, 0051, 0051, 0051 ), + 'percent' => array ( 0045, 0045, 0045, 0045 ), + 'period' => array ( 0056, 0056, 0056, 0056 ), + 'periodcentered' => array ( 0264, 0341, 0267, 0267 ), + 'perthousand' => array ( 0275, 0344, 0211, 0213 ), + 'plus' => array ( 0053, 0053, 0053, 0053 ), + 'plusminus' => array ( 0, 0261, 0261, 0261 ), + 'q' => array ( 0161, 0161, 0161, 0161 ), + 'question' => array ( 0077, 0077, 0077, 0077 ), + 'questiondown' => array ( 0277, 0300, 0277, 0277 ), + 'quotedbl' => array ( 0x22, 0x22, 0x22, 0x22 ), + 'quotedblbase' => array ( 0x22, 0x22, 0x22, 0x22 ), + 'quotedblleft' => array ( 0x22, 0x22, 0x22, 0x22 ), + 'quotedblright' => array ( 0x22, 0x22, 0x22, 0x22 ), + 'quoteleft' => array ( 0x27, 0x27, 0x27, 0x27 ), + 'quoteright' => array ( 0x22, 0x22, 0x22, 0x22 ), + 'quotesinglbase' => array ( 0x22, 0x22, 0x22, 0x22 ), + 'quotesingle' => array ( 0x22, 0x22, 0x22, 0x22 ), + 'r' => array ( 0162, 0162, 0162, 0162 ), + 'registered' => array ( 0, 0250, 0256, 0256 ), + 'ring' => array ( 0312, 0373, 0xB0, 0036 ), + 's' => array ( 0163, 0163, 0163, 0163 ), + 'scaron' => array ( 0, 0, 0232, 0235 ), + 'section' => array ( 
0247, 0244, 0247, 0247 ), + 'semicolon' => array ( 0073, 0073, 0073, 0073 ), + 'seven' => array ( 0067, 0067, 0067, 0067 ), + 'six' => array ( 0066, 0066, 0066, 0066 ), + 'slash' => array ( 0057, 0057, 0057, 0057 ), + 'space' => array ( 0040, 0040, 0040, 0040 ), + 'sterling' => array ( 0243, 0243, 0243, 0243 ), + 't' => array ( 0164, 0164, 0164, 0164 ), + 'thorn' => array ( 0, 0, 0376, 0376 ), + 'three' => array ( 0063, 0063, 0063, 0063 ), + 'threequarters' => array ( 0, 0, 0276, 0276 ), + 'tilde' => array ( 0304, 0367, 0230, 0037 ), + 'trademark' => array ( 0, 0252, 0231, 0222 ), + 'two' => array ( 0062, 0062, 0062, 0062 ), + 'u' => array ( 0165, 0165, 0165, 0165 ), + 'uacute' => array ( 0, 0234, 0372, 0372 ), + 'ucircumflex' => array ( 0, 0236, 0373, 0373 ), + 'udieresis' => array ( 0, 0237, 0374, 0374 ), + 'ugrave' => array ( 0, 0235, 0371, 0371 ), + 'underscore' => array ( 0137, 0137, 0137, 0137 ), + 'v' => array ( 0166, 0166, 0166, 0166 ), + 'w' => array ( 0167, 0167, 0167, 0167 ), + 'x' => array ( 0170, 0170, 0170, 0170 ), + 'y' => array ( 0171, 0171, 0171, 0171 ), + 'yacute' => array ( 0, 0, 0375, 0375 ), + 'ydieresis' => array ( 0, 0330, 0377, 0377 ), + 'yen' => array ( 0245, 0264, 0245, 0245 ), + 'z' => array ( 0172, 0172, 0172, 0172 ), + 'zcaron' => array ( 0, 0, 0236, 0236 ), + 'zero' => array ( 0060, 0060, 0060, 0060 ), + + // Additions which are not described in the PDF specifications - much more foreign characters are available ! 
+ // (see https://mupdf.com/docs/browse/source/pdf/pdf-glyphlist.h.html) + // The following also gives some glyph names : + // http://www.tipometar.org/pojmovnik/Hint/img/Using%20Fontographer.pdf + // This table is currently far from being complete + 'Abreve' => array ( 0x0102, 0x0102, 0x0102, 0x0102 ), + 'abreve' => array ( 0x0103, 0x0103, 0x0103, 0x0103 ), + 'Abreveacute' => array ( 0x1EAE, 0x1EAE, 0x1EAE, 0x1EAE ), + 'abreveacute' => array ( 0x1EAF, 0x1EAF, 0x1EAF, 0x1EAF ), + 'Abrevedotbelow' => array ( 0x1EB6, 0x1EB6, 0x1EB6, 0x1EB6 ), + 'abrevedotbelow' => array ( 0x1EB7, 0x1EB7, 0x1EB7, 0x1EB7 ), + 'Abrevegrave' => array ( 0x1EB0, 0x1EB0, 0x1EB0, 0x1EB0 ), + 'abrevegrave' => array ( 0x1EB1, 0x1EB1, 0x1EB1, 0x1EB1 ), + 'Abrevehookabove' => array ( 0x1EB2, 0x1EB2, 0x1EB2, 0x1EB2 ), + 'abrevehookabove' => array ( 0x1EB3, 0x1EB3, 0x1EB3, 0x1EB3 ), + 'Abrevetilde' => array ( 0x1EB4, 0x1EB4, 0x1EB4, 0x1EB4 ), + 'abrevetilde' => array ( 0x1EB5, 0x1EB5, 0x1EB5, 0x1EB5 ), + 'Acaron' => array ( 0x01CD, 0x01CD, 0x01CD, 0x01CD ), + 'acaron' => array ( 0x01CE, 0x01CE, 0x01CE, 0x01CE ), + 'Acircumflexacute' => array ( 0x1EA4, 0x1EA4, 0x1EA4, 0x1EA4 ), + 'acircumflexacute' => array ( 0x1EA5, 0x1EA5, 0x1EA5, 0x1EA5 ), + 'Acircumflexdotbelow' => array ( 0x1EAC, 0x1EAC, 0x1EAC, 0x1EAC ), + 'acircumflexdotbelow' => array ( 0x1EAD, 0x1EAD, 0x1EAD, 0x1EAD ), + 'Acircumflexgrave' => array ( 0x1EA6, 0x1EA6, 0x1EA6, 0x1EA6 ), + 'acircumflexgrave' => array ( 0x1EA7, 0x1EA7, 0x1EA7, 0x1EA7 ), + 'Acircumflexhookabove' => array ( 0x1EA8, 0x1EA8, 0x1EA8, 0x1EA8 ), + 'acircumflexhookabove' => array ( 0x1EA9, 0x1EA9, 0x1EA9, 0x1EA9 ), + 'Acircumflextilde' => array ( 0x1EAA, 0x1EAA, 0x1EAA, 0x1EAA ), + 'acircumflextilde' => array ( 0x1EAB, 0x1EAB, 0x1EAB, 0x1EAB ), + 'acutecomb' => array ( 0x0301, 0x0301, 0x0301, 0x0301 ), + 'Adot' => array ( 0x0226, 0x0226, 0x0226, 0x0226 ), + 'adot' => array ( 0x0227, 0x0227, 0x0227, 0x0227 ), + 'Adotbelow' => array ( 0x1EA0, 0x1EA0, 0x1EA0, 0x1EA0 ), + 
'adotbelow' => array ( 0x1EA1, 0x1EA1, 0x1EA1, 0x1EA1 ), + 'AEacute' => array ( 0x01FC, 0x01FC, 0x01FC, 0x01FC ), + 'aeacute' => array ( 0x01FD, 0x01FD, 0x01FD, 0x01FD ), + 'Adieresis' => array ( 0x00C4, 0x00C4, 0x00C4, 0x00C4 ), + 'adieresis' => array ( 0x00E4, 0x00E4, 0x00E4, 0x00E4 ), + 'Ahookabove' => array ( 0x1EA2, 0x1EA2, 0x1EA2, 0x1EA2 ), + 'ahookabove' => array ( 0x1EA3, 0x1EA3, 0x1EA3, 0x1EA3 ), + 'Amacron' => array ( 0x0100, 0x0100, 0x0100, 0x0100 ), + 'amacron' => array ( 0x0101, 0x0101, 0x0101, 0x0101 ), + 'Aogonek' => array ( 0x0104, 0x0104, 0x0104, 0x0104 ), + 'aogonek' => array ( 0x0105, 0x0105, 0x0105, 0x0105 ), + 'Aring' => array ( 0x00C5, 0x00C5, 0x00C5, 0x00C5 ), + 'aring' => array ( 0x00E5, 0x00E5, 0x00E5, 0x00E5 ), + 'Aringacute' => array ( 0x01FA, 0x01FA, 0x01FA, 0x01FA ), + 'aringacute' => array ( 0x01FB, 0x01FB, 0x01FB, 0x01FB ), + 'Atilde' => array ( 0x00C3, 0x00C3, 0x00C3, 0x00C3 ), + 'atilde' => array ( 0x00E3, 0x00E3, 0x00E3, 0x00E3 ), + 'Cacute' => array ( 0x0106, 0x0106, 0x0106, 0x0106 ), + 'cacute' => array ( 0x0107, 0x0107, 0x0107, 0x0107 ), + 'Ccaron' => array ( 0x010C, 0x010C, 0x010C, 0x010C ), + 'ccaron' => array ( 0x010D, 0x010D, 0x010D, 0x010D ), + 'Ccircumflex' => array ( 0x0108, 0x0108, 0x0108, 0x0108 ), + 'ccircumflex' => array ( 0x0109, 0x0109, 0x0109, 0x0109 ), + 'Cdot' => array ( 0x010A, 0x010A, 0x010A, 0x010A ), + 'cdot' => array ( 0x010B, 0x010B, 0x010B, 0x010B ), + 'Cdotaccent' => array ( 0x010A, 0x010A, 0x010A, 0x010A ), + 'cdotaccent' => array ( 0x010B, 0x010B, 0x010B, 0x010B ), + 'Dcaron' => array ( 0x010E, 0x010E, 0x010E, 0x010E ), + 'dcaron' => array ( 0x010F, 0x010F, 0x010F, 0x010F ), + 'Dcedilla' => array ( 0x1E10, 0x1E10, 0x1E10, 0x1E10 ), + 'dcedilla' => array ( 0x1E11, 0x1E11, 0x1E11, 0x1E11 ), + 'Dcroat' => array ( 0x0110, 0x0110, 0x0110, 0x0110 ), + 'dcroat' => array ( 0x0111, 0x0111, 0x0111, 0x0111 ), + 'Dmacron' => array ( 0x0110, 0x0110, 0x0110, 0x0110 ), + 'dmacron' => array ( 0x0111, 0x0111, 0x0111, 
0x0111 ), + 'dotbelowcomb' => array ( 0x0323, 0x0323, 0x0323, 0x0323 ), + 'Dslash' => array ( 0x0110, 0x0110, 0x0110, 0x0110 ), + 'dslash' => array ( 0x0111, 0x0111, 0x0111, 0x0111 ), + 'Ebreve' => array ( 0x0114, 0x0114, 0x0114, 0x0114 ), + 'ebreve' => array ( 0x0115, 0x0115, 0x0115, 0x0115 ), + 'Ecaron' => array ( 0x011A, 0x011A, 0x011A, 0x011A ), + 'ecaron' => array ( 0x011B, 0x011B, 0x011B, 0x011B ), + 'Ecedilla' => array ( 0x0228, 0x0228, 0x0228, 0x0228 ), + 'ecedilla' => array ( 0x0229, 0x0229, 0x0229, 0x0229 ), + 'Ecircumflexacute' => array ( 0x1EBE, 0x1EBE, 0x1EBE, 0x1EBE ), + 'ecircumflexacute' => array ( 0x1EBF, 0x1EBF, 0x1EBF, 0x1EBF ), + 'Ecircumflexdotbelow' => array ( 0x1EC6, 0x1EC6, 0x1EC6, 0x1EC6 ), + 'ecircumflexdotbelow' => array ( 0x1EC7, 0x1EC7, 0x1EC7, 0x1EC7 ), + 'Ecircumflexgrave' => array ( 0x1EC0, 0x1EC0, 0x1EC0, 0x1EC0 ), + 'ecircumflexgrave' => array ( 0x1EC1, 0x1EC1, 0x1EC1, 0x1EC1 ), + 'Ecircumflexhookabove' => array ( 0x1EC2, 0x1EC2, 0x1EC2, 0x1EC2 ), + 'ecircumflexhookabove' => array ( 0x1EC3, 0x1EC3, 0x1EC3, 0x1EC3 ), + 'Ecircumflextilde' => array ( 0x1EC4, 0x1EC4, 0x1EC4, 0x1EC4 ), + 'ecircumflextilde' => array ( 0x1EC5, 0x1EC5, 0x1EC5, 0x1EC5 ), + 'Edieresis' => array ( 0x00CB, 0x00CB, 0x00CB, 0x00CB ), + 'edieresis' => array ( 0x00EB, 0x00EB, 0x00EB, 0x00EB ), + 'Edot' => array ( 0x0116, 0x0116, 0x0116, 0x0116 ), + 'edot' => array ( 0x0117, 0x0117, 0x0117, 0x0117 ), + 'Edotaccent' => array ( 0x0116, 0x0116, 0x0116, 0x0116 ), + 'edotaccent' => array ( 0x0117, 0x0117, 0x0117, 0x0117 ), + 'Edotbelow' => array ( 0x1EB8, 0x1EB8, 0x1EB8, 0x1EB8 ), + 'edotbelow' => array ( 0x1EB9, 0x1EB9, 0x1EB9, 0x1EB9 ), + 'Ehookabove' => array ( 0x1EBA, 0x1EBA, 0x1EBA, 0x1EBA ), + 'ehookabove' => array ( 0x1EBB, 0x1EBB, 0x1EBB, 0x1EBB ), + 'Emacron' => array ( 0x0112, 0x0112, 0x0112, 0x0112 ), + 'emacron' => array ( 0x0113, 0x0113, 0x0113, 0x0113 ), + 'Eng' => array ( 0x014A, 0x014A, 0x014A, 0x014A ), + 'eng' => array ( 0x014B, 0x014B, 0x014B, 0x014B 
), + 'Eogonek' => array ( 0x0118, 0x0118, 0x0118, 0x0118 ), + 'eogonek' => array ( 0x0119, 0x0119, 0x0119, 0x0119 ), + 'Ering' => array ( 0x016E, 0x016E, 0x016E, 0x016E ), + 'ering' => array ( 0x016F, 0x016F, 0x016F, 0x016F ), + 'Etilde' => array ( 0x1EBC, 0x1EBC, 0x1EBC, 0x1EBC ), + 'etilde' => array ( 0x1EBD, 0x1EBD, 0x1EBD, 0x1EBD ), + 'Gacute' => array ( 0x01F4, 0x01F4, 0x01F4, 0x01F4 ), + 'gacute' => array ( 0x01F5, 0x01F5, 0x01F5, 0x01F5 ), + 'Gbreve' => array ( 0x011E, 0x011E, 0x011E, 0x011E ), + 'gbreve' => array ( 0x011F, 0x011F, 0x011F, 0x011F ), + 'Gcaron' => array ( 0x01E6, 0x01E6, 0x01E6, 0x01E6 ), + 'gcaron' => array ( 0x01E7, 0x01E7, 0x01E7, 0x01E7 ), + 'Gcedilla' => array ( 0x0122, 0x0122, 0x0122, 0x0122 ), + 'gcedilla' => array ( 0x0123, 0x0123, 0x0123, 0x0123 ), + 'Gcommaaccent' => array ( 0x0122, 0x0122, 0x0122, 0x0122 ), + 'gcommaaccent' => array ( 0x0123, 0x0123, 0x0123, 0x0123 ), + 'Gcircumflex' => array ( 0x011C, 0x011C, 0x011C, 0x011C ), + 'gcircumflex' => array ( 0x011D, 0x011D, 0x011D, 0x011D ), + 'Gdot' => array ( 0x0120, 0x0120, 0x0120, 0x0120 ), + 'gdot' => array ( 0x0121, 0x0121, 0x0121, 0x0121 ), + 'Gdotaccent' => array ( 0x0120, 0x0120, 0x0120, 0x0120 ), + 'gdotaccent' => array ( 0x0121, 0x0121, 0x0121, 0x0121 ), + 'gravecomb' => array ( 0x0300, 0x0300, 0x0300, 0x0300 ), + 'Hbar' => array ( 0x0126, 0x0126, 0x0126, 0x0126 ), + 'hbar' => array ( 0x0127, 0x0127, 0x0127, 0x0127 ), + 'Hcaron' => array ( 0x021E, 0x021E, 0x021E, 0x021E ), + 'hcaron' => array ( 0x021F, 0x021F, 0x021F, 0x021F ), + 'Hcedilla' => array ( 0x1E28, 0x1E28, 0x1E28, 0x1E28 ), + 'hcedilla' => array ( 0x1E29, 0x1E29, 0x1E29, 0x1E29 ), + 'Hcircumflex' => array ( 0x0124, 0x0124, 0x0124, 0x0124 ), + 'hcircumflex' => array ( 0x0125, 0x0125, 0x0125, 0x0125 ), + 'hookabovecomb' => array ( 0x0309, 0x0309, 0x0309, 0x0309 ), + 'Ibreve' => array ( 0x012C, 0x012C, 0x012C, 0x012C ), + 'ibreve' => array ( 0x012D, 0x012D, 0x012D, 0x012D ), + 'Icaron' => array ( 0x01CF, 0x01CF, 
0x01CF, 0x01CF ), + 'icaron' => array ( 0x01D0, 0x01D0, 0x01D0, 0x01D0 ), + 'Idieresis' => array ( 0x00CF, 0x00CF, 0x00CF, 0x00CF ), + 'idieresis' => array ( 0x00EF, 0x00EF, 0x00EF, 0x00EF ), + 'Idot' => array ( 0x00CD, 0x00CD, 0x00CD, 0x00CD ), + 'idot' => array ( 0x00ED, 0x00ED, 0x00ED, 0x00ED ), + 'Idotaccent' => array ( 0x00CD, 0x00CD, 0x00CD, 0x00CD ), + 'idotaccent' => array ( 0x00ED, 0x00ED, 0x00ED, 0x00ED ), + 'Idotbelow' => array ( 0x1ECA, 0x1ECA, 0x1ECA, 0x1ECA ), + 'idotbelow' => array ( 0x1ECB, 0x1ECB, 0x1ECB, 0x1ECB ), + 'Ihookabove' => array ( 0x1EC8, 0x1EC8, 0x1EC8, 0x1EC8 ), + 'ihookabove' => array ( 0x1EC9, 0x1EC9, 0x1EC9, 0x1EC9 ), + 'IJ' => array ( 0x0132, 0x0132, 0x0132, 0x0132 ), + 'ij' => array ( 0x0133, 0x0133, 0x0133, 0x0133 ), + 'Imacron' => array ( 0x012A, 0x012A, 0x012A, 0x012A ), + 'imacron' => array ( 0x012B, 0x012B, 0x012B, 0x012B ), + 'Iogonek' => array ( 0x012E, 0x012E, 0x012E, 0x012E ), + 'iogonek' => array ( 0x012F, 0x012F, 0x012F, 0x012F ), + 'Itilde' => array ( 0x0128, 0x0128, 0x0128, 0x0128 ), + 'itilde' => array ( 0x0129, 0x0129, 0x0129, 0x0129 ), + 'Jcaron' => array ( 0x01F0, 0x01F0, 0x01F0, 0x01F0 ), + 'jcaron' => array ( 0x01EF, 0x01EF, 0x01EF, 0x01EF ), + 'Jcircumflex' => array ( 0x0134, 0x0134, 0x0134, 0x0134 ), + 'jcircumflex' => array ( 0x0135, 0x0135, 0x0135, 0x0135 ), + 'Kacute' => array ( 0x1E30, 0x1E30, 0x1E30, 0x1E30 ), + 'kacute' => array ( 0x1E31, 0x1E31, 0x1E31, 0x1E31 ), + 'kcaron' => array ( 0x01E9, 0x01E9, 0x01E9, 0x01E9 ), + 'Kcaron' => array ( 0x01E8, 0x01E8, 0x01E8, 0x01E8 ), + 'Kcedilla' => array ( 0x0136, 0x0136, 0x0136, 0x0136 ), + 'kcedilla' => array ( 0x0137, 0x0137, 0x0137, 0x0137 ), + 'Kcommaaccent' => array ( 0x0136, 0x0136, 0x0136, 0x0136 ), + 'kcommaaccent' => array ( 0x0137, 0x0137, 0x0137, 0x0137 ), + 'kgreenlandic' => array ( 0x0138, 0x0138, 0x0138, 0x0138 ), + 'Lacute' => array ( 0x0139, 0x0139, 0x0139, 0x0139 ), + 'lacute' => array ( 0x013A, 0x013A, 0x013A, 0x013A ), + 'lcaron' => array ( 
0x013E, 0x013E, 0x013E, 0x013E ), + 'Lcaron' => array ( 0x013D, 0x013D, 0x013D, 0x013D ), + 'Lcedilla' => array ( 0x013B, 0x013B, 0x013B, 0x013B ), + 'lcedilla' => array ( 0x013C, 0x013C, 0x013C, 0x013C ), + 'Lcommaaccent' => array ( 0x013B, 0x013B, 0x013B, 0x013B ), + 'lcommaaccent' => array ( 0x013C, 0x013C, 0x013C, 0x013C ), + 'Ldot' => array ( 0x013F, 0x013F, 0x013F, 0x013F ), + 'ldot' => array ( 0x0140, 0x0140, 0x0140, 0x0140 ), + 'Macute' => array ( 0x1E3E, 0x1E3E, 0x1E3E, 0x1E3E ), + 'macute' => array ( 0x1E3F, 0x1E3F, 0x1E3F, 0x1E3F ), + 'nacute' => array ( 0x0144, 0x0144, 0x0144, 0x0144 ), + 'Nacute' => array ( 0x0143, 0x0143, 0x0143, 0x0143 ), + 'napostrophe' => array ( 0x0149, 0x0149, 0x0149, 0x0149 ), + 'nbspace' => array ( 0x0020, 0x0020, 0x0020, 0x0020 ), + 'Ncaron' => array ( 0x0147, 0x0147, 0x0147, 0x0147 ), + 'ncaron' => array ( 0x0148, 0x0148, 0x0148, 0x0148 ), + 'Ncedilla' => array ( 0x0145, 0x0145, 0x0145, 0x0145 ), + 'ncedilla' => array ( 0x0146, 0x0146, 0x0146, 0x0146 ), + 'Ncommaaccent' => array ( 0x0145, 0x0145, 0x0145, 0x0145 ), + 'ncommaaccent' => array ( 0x0146, 0x0146, 0x0146, 0x0146 ), + 'Ncircumflex' => array ( 0x1E4A, 0x1E4A, 0x1E4A, 0x1E4A ), + 'ncircumflex' => array ( 0x1E4B, 0x1E4B, 0x1E4B, 0x1E4B ), + 'Ntilde' => array ( 0x00D1, 0x00D1, 0x00D1, 0x00D1 ), + 'ntilde' => array ( 0x00F1, 0x00F1, 0x00F1, 0x00F1 ), + 'Obreve' => array ( 0x014E, 0x014E, 0x014E, 0x014E ), + 'obreve' => array ( 0x014F, 0x014F, 0x014F, 0x014F ), + 'Ocaron' => array ( 0x01D1, 0x01D1, 0x01D1, 0x01D1 ), + 'ocaron' => array ( 0x01D2, 0x01D2, 0x01D2, 0x01D2 ), + 'Ocedilla' => array ( 0x0156, 0x0156, 0x0156, 0x0156 ), + 'ocedilla' => array ( 0x0157, 0x0157, 0x0157, 0x0157 ), + 'Ocircumflexacute' => array ( 0x1ED0, 0x1ED0, 0x1ED0, 0x1ED0 ), + 'ocircumflexacute' => array ( 0x1ED1, 0x1ED1, 0x1ED1, 0x1ED1 ), + 'Ocircumflexdotbelow' => array ( 0x1ED8, 0x1ED8, 0x1ED8, 0x1ED8 ), + 'ocircumflexdotbelow' => array ( 0x1ED9, 0x1ED9, 0x1ED9, 0x1ED9 ), + 'Ocircumflexgrave' => 
array ( 0x1ED2, 0x1ED2, 0x1ED2, 0x1ED2 ), + 'ocircumflexgrave' => array ( 0x1ED3, 0x1ED3, 0x1ED3, 0x1ED3 ), + 'Ocircumflexhookabove' => array ( 0x1ED4, 0x1ED4, 0x1ED4, 0x1ED4 ), + 'ocircumflexhookabove' => array ( 0x1ED5, 0x1ED5, 0x1ED5, 0x1ED5 ), + 'Ocircumflextilde' => array ( 0x1ED6, 0x1ED6, 0x1ED6, 0x1ED6 ), + 'ocircumflextilde' => array ( 0x1ED7, 0x1ED7, 0x1ED7, 0x1ED7 ), + 'Odieresis' => array ( 0x00D6, 0x00D6, 0x00D6, 0x00D6 ), + 'odieresis' => array ( 0x00F6, 0x00F6, 0x00F6, 0x00F6 ), + 'Odot' => array ( 0x022E, 0x022E, 0x022E, 0x022E ), + 'odot' => array ( 0x022F, 0x022F, 0x022F, 0x022F ), + 'Odotbelow' => array ( 0x1ECC, 0x1ECC, 0x1ECC, 0x1ECC ), + 'odotbelow' => array ( 0x1ECD, 0x1ECD, 0x1ECD, 0x1ECD ), + 'Odblacute' => array ( 0x0150, 0x0150, 0x0150, 0x0150 ), + 'odblacute' => array ( 0x0151, 0x0151, 0x0151, 0x0151 ), + 'Ohookabove' => array ( 0x1ECE, 0x1ECE, 0x1ECE, 0x1ECE ), + 'ohookabove' => array ( 0x1ECF, 0x1ECF, 0x1ECF, 0x1ECF ), + 'Ohorn' => array ( 0x01A0, 0x01A0, 0x01A0, 0x01A0 ), + 'ohorn' => array ( 0x01A1, 0x01A1, 0x01A1, 0x01A1 ), + 'Ohornacute' => array ( 0x1EDA, 0x1EDA, 0x1EDA, 0x1EDA ), + 'ohornacute' => array ( 0x1EDB, 0x1EDB, 0x1EDB, 0x1EDB ), + 'Ohorndotbelow' => array ( 0x1EE2, 0x1EE2, 0x1EE2, 0x1EE2 ), + 'ohorndotbelow' => array ( 0x1EE3, 0x1EE3, 0x1EE3, 0x1EE3 ), + 'Ohorngrave' => array ( 0x1EDC, 0x1EDC, 0x1EDC, 0x1EDC ), + 'ohorngrave' => array ( 0x1EDD, 0x1EDD, 0x1EDD, 0x1EDD ), + 'Ohornhookabove' => array ( 0x1EDE, 0x1EDE, 0x1EDE, 0x1EDE ), + 'ohornhookabove' => array ( 0x1EDF, 0x1EDF, 0x1EDF, 0x1EDF ), + 'Ohorntilde' => array ( 0x1EE0, 0x1EE0, 0x1EE0, 0x1EE0 ), + 'ohorntilde' => array ( 0x1EE1, 0x1EE1, 0x1EE1, 0x1EE1 ), + 'Ohungarumlaut' => array ( 0x0150, 0x0150, 0x0150, 0x0150 ), + 'ohungarumlaut' => array ( 0x0151, 0x0151, 0x0151, 0x0151 ), + 'omacron' => array ( 0x014C, 0x014C, 0x014C, 0x014C ), + 'Omacron' => array ( 0x014D, 0x014D, 0x014D, 0x014D ), + 'Oogonek' => array ( 0x01EA, 0x01EA, 0x01EA, 0x01EA ), + 'oogonek' => 
array ( 0x01EB, 0x01EB, 0x01EB, 0x01EB ), + 'Oslashacute' => array ( 0x01FE, 0x01FE, 0x01FE, 0x01FE ), + 'oslashacute' => array ( 0x01FF, 0x01FF, 0x01FF, 0x01FF ), + 'Otilde' => array ( 0x00D5, 0x00D5, 0x00D5, 0x00D5 ), + 'otilde' => array ( 0x00F5, 0x00F5, 0x00F5, 0x00F5 ), + 'overscore' => array ( 0x00AF, 0x00AF, 0x00AF, 0x00AF ), + 'Pacute' => array ( 0x1E54, 0x1E54, 0x1E54, 0x1E54 ), + 'pacute' => array ( 0x1E55, 0x1E55, 0x1E55, 0x1E55 ), + 'Racute' => array ( 0x0154, 0x0154, 0x0154, 0x0154 ), + 'racute' => array ( 0x0155, 0x0155, 0x0155, 0x0155 ), + 'Rcaron' => array ( 0x0158, 0x0158, 0x0158, 0x0158 ), + 'rcaron' => array ( 0x0159, 0x0159, 0x0159, 0x0159 ), + 'Rcedilla' => array ( 0x0156, 0x0156, 0x0156, 0x0156 ), + 'rcedilla' => array ( 0x0157, 0x0157, 0x0157, 0x0157 ), + 'Rcommaaccent' => array ( 0x0156, 0x0156, 0x0156, 0x0156 ), + 'rcommaaccent' => array ( 0x0157, 0x0157, 0x0157, 0x0157 ), + 'Sacute' => array ( 0x015A, 0x015A, 0x015A, 0x015A ), + 'sacute' => array ( 0x015B, 0x015B, 0x015B, 0x015B ), + 'Scaron' => array ( 0x0160, 0x0160, 0x0160, 0x0160 ), + 'scaron' => array ( 0x0161, 0x0161, 0x0161, 0x0161 ), + 'Scedilla' => array ( 0x015E, 0x015E, 0x015E, 0x015E ), + 'scedilla' => array ( 0x015F, 0x015F, 0x015F, 0x015F ), + 'Scircumflex' => array ( 0x015C, 0x015C, 0x015C, 0x015C ), + 'scircumflex' => array ( 0x015D, 0x015D, 0x015D, 0x015D ), + 'Scommaaccent' => array ( 0x0218, 0x0218, 0x0218, 0x0218 ), + 'scommaaccent' => array ( 0x0219, 0x0219, 0x0219, 0x0219 ), + 'Tbar' => array ( 0x1E6E, 0x1E6E, 0x1E6E, 0x1E6E ), + 'tbar' => array ( 0x1E6F, 0x1E6F, 0x1E6F, 0x1E6F ), + 'Tcaron' => array ( 0x0164, 0x0164, 0x0164, 0x0164 ), + 'tcaron' => array ( 0x0165, 0x0165, 0x0165, 0x0165 ), + 'Tcedilla' => array ( 0x0162, 0x0162, 0x0162, 0x0162 ), + 'tcedilla' => array ( 0x0163, 0x0163, 0x0163, 0x0163 ), + 'Tcommaaccent' => array ( 0x0162, 0x0162, 0x0162, 0x0162 ), + 'tcommaaccent' => array ( 0x0163, 0x0163, 0x0163, 0x0163 ), + 'tildecomb' => array ( 0x0303, 0x0303, 
0x0303, 0x0303 ), + 'Ubreve' => array ( 0x016C, 0x016C, 0x016C, 0x016C ), + 'ubreve' => array ( 0x016D, 0x016D, 0x016D, 0x016D ), + 'Ucaron' => array ( 0x01D3, 0x01D3, 0x01D3, 0x01D3 ), + 'uCaron' => array ( 0x01D4, 0x01D4, 0x01D4, 0x01D4 ), + 'Udblacute' => array ( 0x0170, 0x0170, 0x0170, 0x0170 ), + 'udblacute' => array ( 0x0171, 0x0171, 0x0171, 0x0171 ), + 'Udieresis' => array ( 0x00DC, 0x00DC, 0x00DC, 0x00DC ), + 'udieresis' => array ( 0x00FC, 0x00FC, 0x00FC, 0x00FC ), + 'Udotbelow' => array ( 0x1EE4, 0x1EE4, 0x1EE4, 0x1EE4 ), + 'udotbelow' => array ( 0x1EE5, 0x1EE5, 0x1EE5, 0x1EE5 ), + 'Uhookabove' => array ( 0x1EE6, 0x1EE6, 0x1EE6, 0x1EE6 ), + 'uhookabove' => array ( 0x1EE7, 0x1EE7, 0x1EE7, 0x1EE7 ), + 'Uhorn' => array ( 0x01AF, 0x01AF, 0x01AF, 0x01AF ), + 'uhorn' => array ( 0x01B0, 0x01B0, 0x01B0, 0x01B0 ), + 'Uhornacute' => array ( 0x1EE8, 0x1EE8, 0x1EE8, 0x1EE8 ), + 'uhornacute' => array ( 0x1EE9, 0x1EE9, 0x1EE9, 0x1EE9 ), + 'Uhorndotbelow' => array ( 0x1EF0, 0x1EF0, 0x1EF0, 0x1EF0 ), + 'uhorndotbelow' => array ( 0x1EF1, 0x1EF1, 0x1EF1, 0x1EF1 ), + 'Uhorngrave' => array ( 0x1EEA, 0x1EEA, 0x1EEA, 0x1EEA ), + 'uhorngrave' => array ( 0x1EEB, 0x1EEB, 0x1EEB, 0x1EEB ), + 'Uhornhookabove' => array ( 0x1EEC, 0x1EEC, 0x1EEC, 0x1EEC ), + 'uhornhookabove' => array ( 0x1EED, 0x1EED, 0x1EED, 0x1EED ), + 'Uhorntilde' => array ( 0x1EEE, 0x1EEE, 0x1EEE, 0x1EEE ), + 'uhorntilde' => array ( 0x1EEF, 0x1EEF, 0x1EEF, 0x1EEF ), + 'Uhungarumlaut' => array ( 0x0170, 0x0170, 0x0170, 0x0170 ), + 'uhungarumlaut' => array ( 0x0171, 0x0171, 0x0171, 0x0171 ), + 'Umacron' => array ( 0x016A, 0x016A, 0x016A, 0x016A ), + 'umacron' => array ( 0x016B, 0x016B, 0x016B, 0x016B ), + 'Uogonek' => array ( 0x0172, 0x0172, 0x0172, 0x0172 ), + 'uogonek' => array ( 0x0173, 0x0173, 0x0173, 0x0173 ), + 'Uring' => array ( 0x016E, 0x016E, 0x016E, 0x016E ), + 'uring' => array ( 0x016F, 0x016F, 0x016F, 0x016F ), + 'upsilondieresis' => array ( 0x00FF, 0x00FF, 0x00FF, 0x00FF ), + 'Upsilondieresis' => array ( 
0x0178, 0x0178, 0x0178, 0x0178 ), + 'Utilde' => array ( 0x0168, 0x0168, 0x0168, 0x0168 ), + 'utilde' => array ( 0x0169, 0x0169, 0x0169, 0x0169 ), + 'Wacute' => array ( 0x1E82, 0x1E82, 0x1E82, 0x1E82 ), + 'wacute' => array ( 0x1E83, 0x1E83, 0x1E83, 0x1E83 ), + 'Wcircumflex' => array ( 0x0174, 0x0174, 0x0174, 0x0174 ), + 'wcircumflex' => array ( 0x0175, 0x0175, 0x0175, 0x0175 ), + 'Wdieresis' => array ( 0x1E84, 0x1E84, 0x1E84, 0x1E84 ), + 'wdieresis' => array ( 0x1E8E, 0x1E8E, 0x1E8E, 0x1E8E ), + 'Wgrave' => array ( 0x00C0, 0x00C0, 0x00C0, 0x00C0 ), + 'wgrave' => array ( 0x00E0, 0x00E0, 0x00E0, 0x00E0 ), + 'Yacute' => array ( 0x00DD, 0x00DD, 0x00DD, 0x00DD ), + 'yacute' => array ( 0x00DE, 0x00DE, 0x00DE, 0x00DE ), + 'Ycircumflex' => array ( 0x0176, 0x0176, 0x0176, 0x0176 ), + 'ycircumflex' => array ( 0x0177, 0x0177, 0x0177, 0x0177 ), + 'Ydieresis' => array ( 0x0178, 0x0178, 0x0178, 0x0178 ), + 'ydieresis' => array ( 0x00FF, 0x00FF, 0x00FF, 0x00FF ), + 'Ydotbelow' => array ( 0x1EF4, 0x1EF4, 0x1EF4, 0x1EF4 ), + 'ydotbelow' => array ( 0x1EF5, 0x1EF5, 0x1EF5, 0x1EF5 ), + 'Ygrave' => array ( 0x1EF2, 0x1EF2, 0x1EF2, 0x1EF2 ), + 'ygrave' => array ( 0x1EF3, 0x1EF3, 0x1EF3, 0x1EF3 ), + 'Yhookabove' => array ( 0x1EF6, 0x1EF6, 0x1EF6, 0x1EF6 ), + 'yhookabove' => array ( 0x1EF7, 0x1EF7, 0x1EF7, 0x1EF7 ), + 'Ytilde' => array ( 0x1EF8, 0x1EF8, 0x1EF8, 0x1EF8 ), + 'ytilde' => array ( 0x1EF9, 0x1EF9, 0x1EF9, 0x1EF9 ), + 'Zacute' => array ( 0x0179, 0x0179, 0x0179, 0x0179 ), + 'zacute' => array ( 0x017A, 0x017A, 0x017A, 0x017A ), + 'Zcaron' => array ( 0x017E, 0x017E, 0x017E, 0x017E ), + 'zcaron' => array ( 0x017D, 0x017D, 0x017D, 0x017D ), + 'Zcircumflex' => array ( 0x1E90, 0x1E90, 0x1E90, 0x1E90 ), + 'zcircumflex' => array ( 0x1E91, 0x1E91, 0x1E91, 0x1E91 ), + 'zdot' => array ( 0x017C, 0x017C, 0x017C, 0x017C ), + 'Zdot' => array ( 0x017B, 0x017B, 0x017B, 0x017B ), + 'zdotaccent' => array ( 0x017C, 0x017C, 0x017C, 0x017C ), + 'Zdotaccent' => array ( 0x017B, 0x017B, 0x017B, 0x017B ), + 
+ // Special symbols + 'approxequal' => array ( 0x2248, 0x2248, 0x2248, 0x2248 ), + 'arrowleft' => array ( 0x2190, 0x2190, 0x2190, 0x2190 ), + 'arrowright' => array ( 0x2192, 0x2192, 0x2192, 0x2192 ), + 'block' => array ( 0x2588, 0x2588, 0x2588, 0x2588 ), + 'circle' => array ( 0x25CB, 0x25CB, 0x25CB, 0x25CB ), + 'club' => array ( 0x2663, 0x2663, 0x2663, 0x2663 ), + 'commaaccent' => array ( 0x002C, 0x002C, 0x002C, 0x002C ), + 'congruent' => array ( 0x2261, 0x2261, 0x2261, 0x2261 ), + 'dkshade' => array ( 0x2593, 0x2593, 0x2593, 0x2593 ), + 'dnblock' => array ( 0x2584, 0x2584, 0x2584, 0x2584 ), + 'eightsuperior' => array ( 0x2078, 0x2663, 0x2663, 0x2663 ), + 'emptyset' => array ( 0x2205, 0x2205, 0x2205, 0x2205 ), + 'equivalence' => array ( 0x2261, 0x2261, 0x2261, 0x2261 ), + 'estimated' => array ( 0x212E, 0x212E, 0x212E, 0x212E ), + 'exclamdbl' => array ( 0x203C, 0x203C, 0x203C, 0x203C ), + 'female' => array ( 0x2640, 0x2640, 0x2640, 0x2640 ), + 'filledbox' => array ( 0x25A0, 0x25A0, 0x25A0, 0x25A0 ), + 'filledrect' => array ( 0x25AC, 0x25AC, 0x25AC, 0x25AC ), + 'fiveeighths' => array ( 0x251D, 0x251D, 0x251D, 0x251D ), + 'fivesuperior' => array ( 0x2075, 0x2075, 0x2075, 0x2075 ), + 'foursuperior' => array ( 0x2074, 0x2074, 0x2074, 0x2074 ), + 'four.superior' => array ( 0x2074, 0x2074, 0x2074, 0x2074 ), + 'franc' => array ( 0x20A3, 0x20A3, 0x20A3, 0x20A3 ), + 'greaterequal' => array ( 0x2265, 0x2265, 0x2265, 0x2265 ), + 'heart' => array ( 0x2665, 0x2665, 0x2665, 0x2665 ), + 'house' => array ( 0x2302, 0x2302, 0x2302, 0x2302 ), + 'increment' => array ( 0x2206, 0x2206, 0x2206, 0x2206 ), + 'infinity' => array ( 0x221E, 0x221E, 0x221E, 0x221E ), + 'integral' => array ( 0x222B, 0x222B, 0x222B, 0x222B ), + 'integralbt' => array ( 0x2321, 0x2321, 0x2321, 0x2321 ), + 'integraltp' => array ( 0x2320, 0x2320, 0x2320, 0x2320 ), + 'intersection' => array ( 0x2229, 0x2229, 0x2229, 0x2229 ), + 'invbullet' => array ( 0x25D8, 0x25D8, 0x25D8, 0x25D8 ), + 'invcircle' => array ( 0x25D9, 
0x25D9, 0x25D9, 0x25D9 ), + 'invsmileface' => array ( 0x263B, 0x263B, 0x263B, 0x263B ), + 'lessequal' => array ( 0x2264, 0x2264, 0x2264, 0x2264 ), + 'lfblock' => array ( 0x258C, 0x258C, 0x258C, 0x258C ), + 'lira' => array ( 0x20A4, 0x20A4, 0x20A4, 0x20A4 ), + 'ltshade' => array ( 0x2591, 0x2591, 0x2591, 0x2591 ), + 'longs' => array ( 0x017F, 0x017F, 0x017F, 0x017F ), + 'male' => array ( 0x2642, 0x2642, 0x2642, 0x2642 ), + 'middot' => array ( 0x00B7, 0x00B7, 0x00B7, 0x00B7 ), + 'minute' => array ( 0x2032, 0x2032, 0x2032, 0x2032 ), + 'musicalnote' => array ( 0x266A, 0x266A, 0x266A, 0x266A ), + 'musicalnotedbl' => array ( 0x266B, 0x266B, 0x266B, 0x266B ), + 'ninesuperior' => array ( 0x2079, 0x2079, 0x2079, 0x2079 ), + 'notequal' => array ( 0x2260, 0x2260, 0x2260, 0x2260 ), + 'nsuperior' => array ( 0x207F, 0x207F, 0x207F, 0x207F ), + 'Ohm' => array ( 0x2126, 0x2126, 0x2126, 0x2126 ), + 'ohm' => array ( 0x03C9, 0x03C9, 0x03C9, 0x03C9 ), + 'oneeighth' => array ( 0x215B, 0x215B, 0x215B, 0x215B ), + 'onesuperior' => array ( 0x2071, 0x2071, 0x2071, 0x2071 ), + 'one.superior' => array ( 0x2071, 0x2071, 0x2071, 0x2071 ), + 'onethird' => array ( 0x2153, 0x2153, 0x2153, 0x2153 ), + 'orthogonal' => array ( 0x221F, 0x221F, 0x221F, 0x221F ), + 'parenleftbt' => array ( 0x0028, 0x0028, 0x0028, 0x0028 ), + 'parenleftex' => array ( 0x0028, 0x0028, 0x0028, 0x0028 ), + 'parenlefttp' => array ( 0x0028, 0x0028, 0x0028, 0x0028 ), + 'parenrightbt' => array ( 0x0029, 0x0029, 0x0029, 0x0029 ), + 'parenrightex' => array ( 0x0029, 0x0029, 0x0029, 0x0029 ), + 'parenrighttp' => array ( 0x0029, 0x0029, 0x0029, 0x0029 ), + 'partialdiff' => array ( 0x2202, 0x2202, 0x2202, 0x2202 ), + 'peseta' => array ( 0x20A7, 0x20A7, 0x20A7, 0x20A7 ), + 'product' => array ( 0x220F, 0x220F, 0x220F, 0x220F ), + 'quotereversed' => array ( 0x201B, 0x201B, 0x201B, 0x201B ), + 'radical' => array ( 0x23B7, 0x23B7, 0x23B7, 0x23B7 ), + 'radicalex' => array ( 0x203E, 0x203E, 0x203E, 0x203E ), + 'revlogicalnot' => array ( 
0x2310, 0x2310, 0x2310, 0x2310 ), + 'rtblock' => array ( 0x2590, 0x2590, 0x2590, 0x2590 ), + 'second' => array ( 0x2033, 0x2033, 0x2033, 0x2033 ), + 'seveneighths' => array ( 0x215E, 0x215E, 0x215E, 0x215E ), + 'sevensuperior' => array ( 0x2077, 0x2077, 0x2077, 0x2077 ), + 'shade' => array ( 0x2592, 0x2592, 0x2592, 0x2592 ), + 'similar' => array ( 0x2242, 0x2242, 0x2242, 0x2242 ), + 'smileface' => array ( 0x263A, 0x263A, 0x263A, 0x263A ), + 'sixsuperior' => array ( 0x2076, 0x2076, 0x2076, 0x2076 ), + 'spade' => array ( 0x2660, 0x2660, 0x2660, 0x2660 ), + 'summation' => array ( 0x2211, 0x2211, 0x2211, 0x2211 ), + 'sun' => array ( 0x263C, 0x263C, 0x263C, 0x263C ), + 'threeeighths' => array ( 0x215C, 0x215C, 0x215C, 0x215C ), + 'threesuperior' => array ( 0x00B3, 0x00B3, 0x00B3, 0x00B3 ), + 'three.superior' => array ( 0x00B3, 0x00B3, 0x00B3, 0x00B3 ), + 'triagdn' => array ( 0x25BC, 0x25BC, 0x25BC, 0x25BC ), + 'triaglf' => array ( 0x25C4, 0x25C4, 0x25C4, 0x25C4 ), + 'triagrt' => array ( 0x25BA, 0x25BA, 0x25BA, 0x25BA ), + 'triagup' => array ( 0x25B2, 0x25B2, 0x25B2, 0x25B2 ), + 'twosuperior' => array ( 0x00B2, 0x00B2, 0x00B2, 0x00B2 ), + 'two.superior' => array ( 0x00B2, 0x00B2, 0x00B2, 0x00B2 ), + 'twothirds' => array ( 0x2154, 0x2154, 0x2154, 0x2154 ), + 'undercommaaccent' => array ( 0x0326, 0x0326, 0x0326, 0x0326 ), + 'underscoredbl' => array ( 0x005F, 0x005F, 0x005F, 0x005F ), + 'upblock' => array ( 0x2580, 0x2580, 0x2580, 0x2580 ), + 'zerosuperior' => array ( 0x2070, 0x2070, 0x2070, 0x2070 ), + + // Greek characters + 'Alpha' => array ( 0x0391, 0x0391, 0x0391, 0x0391 ), + 'alpha' => array ( 0x03B1, 0x03B1, 0x03B1, 0x03B1 ), + 'Alphatonos' => array ( 0x0386, 0x0386, 0x0386, 0x0386 ), + 'alphatonos' => array ( 0x03AC, 0x03AC, 0x03AC, 0x03AC ), + 'anoteleia' => array ( 0x0387, 0x0387, 0x0387, 0x0387 ), + 'Beta' => array ( 0x0392, 0x0392, 0x0392, 0x0392 ), + 'beta' => array ( 0x03B2, 0x03B2, 0x03B2, 0x03B2 ), + 'Gamma' => array ( 0x0393, 0x0393, 0x0393, 0x0393 ), + 
'gamma' => array ( 0x03B3, 0x03B3, 0x03B3, 0x03B3 ), + 'Delta' => array ( 0x0394, 0x0394, 0x0394, 0x0394 ), + 'Deltagreek' => array ( 0x0394, 0x0394, 0x0394, 0x0394 ), + 'delta' => array ( 0x03B4, 0x03B4, 0x03B4, 0x03B4 ), + 'dieresistonos' => array ( 0x0385, 0x0385, 0x0385, 0x0385 ), + 'Epsilon' => array ( 0x0395, 0x0395, 0x0395, 0x0395 ), + 'epsilon' => array ( 0x03B5, 0x03B5, 0x03B5, 0x03B5 ), + 'Epsilontonos' => array ( 0x0388, 0x0388, 0x0388, 0x0388 ), + 'epsilontonos' => array ( 0x03AD, 0x03AD, 0x03AD, 0x03AD ), + 'Etatonos' => array ( 0x0389, 0x0389, 0x0389, 0x0389 ), + 'etatonos' => array ( 0x03AD, 0x03AD, 0x03AD, 0x03AD ), + 'Zeta' => array ( 0x0396, 0x0396, 0x0396, 0x0396 ), + 'zeta' => array ( 0x03B6, 0x03B6, 0x03B6, 0x03B6 ), + 'Eta' => array ( 0x0397, 0x0397, 0x0397, 0x0397 ), + 'eta' => array ( 0x03B7, 0x03B7, 0x03B7, 0x03B7 ), + 'Theta' => array ( 0x0398, 0x0398, 0x0398, 0x0398 ), + 'theta' => array ( 0x03B8, 0x03B8, 0x03B8, 0x03B8 ), + 'Iota' => array ( 0x0399, 0x0399, 0x0399, 0x0399 ), + 'Iotadieresis' => array ( 0x03AA, 0x03AA, 0x03AA, 0x03AA ), + 'iotadieresis' => array ( 0x03CA, 0x03CA, 0x03CA, 0x03CA ), + 'iota' => array ( 0x03B9, 0x03B9, 0x03B9, 0x03B9 ), + 'iotadieresistonos' => array ( 0x0390, 0x0390, 0x0390, 0x0390 ), + 'Iotatonos' => array ( 0x038A, 0x038A, 0x038A, 0x038A ), + 'iotatonos' => array ( 0x03AF, 0x03AF, 0x03AF, 0x03AF ), + 'Kappa' => array ( 0x039A, 0x039A, 0x039A, 0x039A ), + 'kappa' => array ( 0x03BA, 0x03BA, 0x03BA, 0x03BA ), + 'Lambda' => array ( 0x039B, 0x039B, 0x039B, 0x039B ), + 'lambda' => array ( 0x03BB, 0x03BB, 0x03BB, 0x03BB ), + 'Mu' => array ( 0x039C, 0x039C, 0x039C, 0x039C ), + 'mu' => array ( 0x03BC, 0x03BC, 0x03BC, 0x03BC ), + 'Mu1' => array ( 0x039C, 0x039C, 0x039C, 0x039C ), + 'mu1' => array ( 0x03BC, 0x03BC, 0x03BC, 0x03BC ), + 'Nu' => array ( 0x039D, 0x039D, 0x039D, 0x039D ), + 'nu' => array ( 0x03BD, 0x03BD, 0x03BD, 0x03BD ), + 'Xi' => array ( 0x039E, 0x039E, 0x039E, 0x039E ), + 'xi' => array ( 0x03BE, 
0x03BE, 0x03BE, 0x03BE ), + 'Omicron' => array ( 0x039F, 0x039F, 0x039F, 0x039F ), + 'omicron' => array ( 0x03BF, 0x03BF, 0x03BF, 0x03BF ), + 'Omicrontonos' => array ( 0x038C, 0x038C, 0x038C, 0x038C ), + 'omicrontonos' => array ( 0x03CC, 0x03CC, 0x03CC, 0x03CC ), + 'Pi' => array ( 0x03A0, 0x03A0, 0x03A0, 0x03A0 ), + 'pi' => array ( 0x03C0, 0x03C0, 0x03C0, 0x03C0 ), + 'Rho' => array ( 0x03A1, 0x03A1, 0x03A1, 0x03A1 ), + 'rho' => array ( 0x03C1, 0x03C1, 0x03C1, 0x03C1 ), + 'Sigma' => array ( 0x03A3, 0x03A3, 0x03A3, 0x03A3 ), + 'sigma' => array ( 0x03C3, 0x03C3, 0x03C3, 0x03C3 ), + 'Sigma1' => array ( 0x03A2, 0x03A2, 0x03A2, 0x03A2 ), + 'sigma1' => array ( 0x03C2, 0x03C2, 0x03C2, 0x03C2 ), + 'Tau' => array ( 0x03A4, 0x03A4, 0x03A4, 0x03A4 ), + 'tonos' => array ( 0x0384, 0x0384, 0x0384, 0x0384 ), + 'tau' => array ( 0x03C4, 0x03C4, 0x03C4, 0x03C4 ), + 'Upsilon' => array ( 0x03A5, 0x03A5, 0x03A5, 0x03A5 ), + 'upsilon' => array ( 0x03C5, 0x03C5, 0x03C5, 0x03C5 ), + 'Upsilondieresis' => array ( 0x03AB, 0x03AB, 0x03AB, 0x03AB ), + 'upsilondieresis' => array ( 0x03CB, 0x03CB, 0x03CB, 0x03CB ), + 'Upsilontonos' => array ( 0x038E, 0x038E, 0x038E, 0x038E ), + 'upsilontonos' => array ( 0x03CD, 0x03CD, 0x03CD, 0x03CD ), + 'upsilondieresistonos' => array ( 0x03B0, 0x03B0, 0x03B0, 0x03B0 ), + 'Phi' => array ( 0x03A6, 0x03A6, 0x03A6, 0x03A6 ), + 'phi' => array ( 0x03C6, 0x03C6, 0x03C6, 0x03C6 ), + 'Chi' => array ( 0x03A7, 0x03A7, 0x03A7, 0x03A7 ), + 'chi' => array ( 0x03C7, 0x03C7, 0x03C7, 0x03C7 ), + 'Psi' => array ( 0x03A8, 0x03A8, 0x03A8, 0x03A8 ), + 'psi' => array ( 0x03C8, 0x03C8, 0x03C8, 0x03C8 ), + 'Omega' => array ( 0x03A9, 0x03A9, 0x03A9, 0x03A9 ), + 'omega' => array ( 0x03C9, 0x03C9, 0x03C9, 0x03C9 ), + 'Omegatonos' => array ( 0x038F, 0x038F, 0x038F, 0x038F ), + 'omegatonos' => array ( 0x03CE, 0x03CE, 0x03CE, 0x03CE ), + + // http://www.tipometar.org/pojmovnik/Hint/img/Using%20Fontographer.pdf + // 
ftp://ftp.software.ibm.com/software/globalization/gcoc/attachments/CP00437.pdf + // http://jrgraphix.net/r/Unicode/2500-257F + // http://www.alanwood.net/demos/wingdings.html + // Almost everything is in the links ; the table below needs to be completed, though + '.notdef' => array ( 0x0020, 0x0020, 0x0020, 0x0020 ), // Undefined (?) + 'afii00208' => array ( 0x002D, 0x002D, 0x002D, 0x002D ), // Minus + 'afii08941' => array ( 0x204A, 0x204A, 0x204A, 0x204A ), // Pound + 'afii10017' => array ( 0x0410, 0x0410, 0x0410, 0x0410 ), + 'afii10018' => array ( 0x0411, 0x0411, 0x0411, 0x0411 ), + 'afii10019' => array ( 0x0412, 0x0412, 0x0412, 0x0412 ), + 'afii10020' => array ( 0x0413, 0x0413, 0x0413, 0x0413 ), + 'afii10021' => array ( 0x0414, 0x0414, 0x0414, 0x0414 ), + 'afii10022' => array ( 0x0415, 0x0415, 0x0415, 0x0415 ), + 'afii10023' => array ( 0x0401, 0x0401, 0x0401, 0x0401 ), + 'afii10024' => array ( 0x0416, 0x0416, 0x0416, 0x0416 ), + 'afii10025' => array ( 0x0417, 0x0417, 0x0417, 0x0417 ), + 'afii10026' => array ( 0x0418, 0x0418, 0x0418, 0x0418 ), + 'afii10027' => array ( 0x0419, 0x0419, 0x0419, 0x0419 ), + 'afii10028' => array ( 0x041a, 0x041a, 0x041a, 0x041a ), + 'afii10029' => array ( 0x041b, 0x041b, 0x041b, 0x041b ), + 'afii10030' => array ( 0x041c, 0x041c, 0x041c, 0x041c ), + 'afii10031' => array ( 0x041d, 0x041d, 0x041d, 0x041d ), + 'afii10032' => array ( 0x041e, 0x041e, 0x041e, 0x041e ), + 'afii10033' => array ( 0x041f, 0x041f, 0x041f, 0x041f ), + 'afii10034' => array ( 0x0420, 0x0420, 0x0420, 0x0420 ), + 'afii10035' => array ( 0x0421, 0x0421, 0x0421, 0x0421 ), + 'afii10036' => array ( 0x0422, 0x0422, 0x0422, 0x0422 ), + 'afii10037' => array ( 0x0423, 0x0423, 0x0423, 0x0423 ), + 'afii10038' => array ( 0x0424, 0x0424, 0x0424, 0x0424 ), + 'afii10039' => array ( 0x0425, 0x0425, 0x0425, 0x0425 ), + 'afii10040' => array ( 0x0426, 0x0426, 0x0426, 0x0426 ), + 'afii10041' => array ( 0x0427, 0x0427, 0x0427, 0x0427 ), + 'afii10042' => array ( 0x0428, 0x0428, 0x0428, 
0x0428 ), + 'afii10043' => array ( 0x0429, 0x0429, 0x0429, 0x0429 ), + 'afii10044' => array ( 0x042a, 0x042a, 0x042a, 0x042a ), + 'afii10045' => array ( 0x042b, 0x042b, 0x042b, 0x042b ), + 'afii10046' => array ( 0x042c, 0x042c, 0x042c, 0x042c ), + 'afii10047' => array ( 0x042d, 0x042d, 0x042d, 0x042d ), + 'afii10048' => array ( 0x042e, 0x042e, 0x042e, 0x042e ), + 'afii10049' => array ( 0x042f, 0x042f, 0x042f, 0x042f ), + 'afii10050' => array ( 0x0490, 0x0490, 0x0490, 0x0490 ), + 'afii10051' => array ( 0x0402, 0x0402, 0x0402, 0x0402 ), + 'afii10052' => array ( 0x0403, 0x0403, 0x0403, 0x0403 ), + 'afii10053' => array ( 0x0404, 0x0404, 0x0404, 0x0404 ), + 'afii10054' => array ( 0x0405, 0x0405, 0x0405, 0x0405 ), + 'afii10055' => array ( 0x0406, 0x0406, 0x0406, 0x0406 ), + 'afii10056' => array ( 0x0407, 0x0407, 0x0407, 0x0407 ), + 'afii10057' => array ( 0x0408, 0x0408, 0x0408, 0x0408 ), + 'afii10058' => array ( 0x0409, 0x0409, 0x0409, 0x0409 ), + 'afii10059' => array ( 0x040a, 0x040a, 0x040a, 0x040a ), + 'afii10060' => array ( 0x040b, 0x040b, 0x040b, 0x040b ), + 'afii10061' => array ( 0x040c, 0x040c, 0x040c, 0x040c ), + 'afii10062' => array ( 0x040e, 0x040e, 0x040e, 0x040e ), + 'afii10065' => array ( 0x0430, 0x0430, 0x0430, 0x0430 ), + 'afii10066' => array ( 0x0431, 0x0431, 0x0431, 0x0431 ), + 'afii10067' => array ( 0x0432, 0x0432, 0x0432, 0x0432 ), + 'afii10068' => array ( 0x0433, 0x0433, 0x0433, 0x0433 ), + 'afii10069' => array ( 0x0434, 0x0434, 0x0434, 0x0434 ), + 'afii10070' => array ( 0x0435, 0x0435, 0x0435, 0x0435 ), + 'afii10071' => array ( 0x0436, 0x0436, 0x0436, 0x0436 ), + 'afii10072' => array ( 0x0437, 0x0437, 0x0437, 0x0437 ), + 'afii10073' => array ( 0x0438, 0x0438, 0x0438, 0x0438 ), + 'afii10074' => array ( 0x0439, 0x0439, 0x0439, 0x0439 ), + 'afii10075' => array ( 0x043a, 0x043a, 0x043a, 0x043a ), + 'afii10076' => array ( 0x043b, 0x043b, 0x043b, 0x043b ), + 'afii10077' => array ( 0x043c, 0x043c, 0x043c, 0x043c ), + 'afii10078' => array ( 0x043d, 0x043d, 
0x043d, 0x043d ), + 'afii10079' => array ( 0x043e, 0x043e, 0x043e, 0x043e ), + 'afii10080' => array ( 0x043f, 0x043f, 0x043f, 0x043f ), + 'afii10081' => array ( 0x0440, 0x0440, 0x0440, 0x0440 ), + 'afii10082' => array ( 0x0441, 0x0441, 0x0441, 0x0441 ), + 'afii10083' => array ( 0x0442, 0x0442, 0x0442, 0x0442 ), + 'afii10084' => array ( 0x0443, 0x0443, 0x0443, 0x0443 ), + 'afii10085' => array ( 0x0444, 0x0444, 0x0444, 0x0444 ), + 'afii10086' => array ( 0x0445, 0x0445, 0x0445, 0x0445 ), + 'afii10087' => array ( 0x0446, 0x0446, 0x0446, 0x0446 ), + 'afii10088' => array ( 0x0447, 0x0447, 0x0447, 0x0447 ), + 'afii10089' => array ( 0x0448, 0x0448, 0x0448, 0x0448 ), + 'afii10090' => array ( 0x0449, 0x0449, 0x0449, 0x0449 ), + 'afii10091' => array ( 0x044a, 0x044a, 0x044a, 0x044a ), + 'afii10092' => array ( 0x044b, 0x044b, 0x044b, 0x044b ), + 'afii10093' => array ( 0x044c, 0x044c, 0x044c, 0x044c ), + 'afii10094' => array ( 0x044d, 0x044d, 0x044d, 0x044d ), + 'afii10095' => array ( 0x044e, 0x044e, 0x044e, 0x044e ), + 'afii10096' => array ( 0x044f, 0x044f, 0x044f, 0x044f ), + 'afii10097' => array ( 0x0450, 0x0450, 0x0450, 0x0450 ), + 'afii10098' => array ( 0x0451, 0x0451, 0x0451, 0x0451 ), + 'afii10099' => array ( 0x0452, 0x0452, 0x0452, 0x0452 ), + 'afii10100' => array ( 0x0453, 0x0453, 0x0453, 0x0453 ), + 'afii10101' => array ( 0x0454, 0x0454, 0x0454, 0x0454 ), + 'afii10102' => array ( 0x0455, 0x0455, 0x0455, 0x0455 ), + 'afii10103' => array ( 0x0456, 0x0456, 0x0456, 0x0456 ), + 'afii10104' => array ( 0x0457, 0x0457, 0x0457, 0x0457 ), + 'afii10105' => array ( 0x0458, 0x0458, 0x0458, 0x0458 ), + 'afii10106' => array ( 0x0459, 0x0459, 0x0459, 0x0459 ), + 'afii10107' => array ( 0x045a, 0x045a, 0x045a, 0x045a ), + 'afii10108' => array ( 0x045b, 0x045b, 0x045b, 0x045b ), + 'afii10109' => array ( 0x045c, 0x045c, 0x045c, 0x045c ), + 'afii10110' => array ( 0x045E, 0x045E, 0x045E, 0x045E ), + 'afii10145' => array ( 0x040F, 0x040F, 0x040F, 0x040F ), + 'afii10193' => array ( 0x045F, 
0x045F, 0x045F, 0x045F ), + 'afii61248' => array ( 0x2105, 0x2105, 0x2105, 0x2105 ), // English symbol "care of" + 'afii61289' => array ( 0x2113, 0x2113, 0x2113, 0x2113 ), // Lower "l de ronde" + 'afii61352' => array ( 0x2116, 0x2116, 0x2116, 0x2116 ), + 'H18543' => array ( 0x25A0, 0x25A0, 0x25A0, 0x25A0 ), // Black square + 'H18533' => array ( 0x25CF, 0x25CF, 0x25CF, 0x25CF ), // Black circle + 'H22073' => array ( 0x25A1, 0x25A1, 0x25A1, 0x25A1 ), // White square + 'H18551' => array ( 0x25AB, 0x25AB, 0x25AB, 0x25AB ), // White square with double horizontal borders + 'SF070000' => array ( 0x2534, 0x2534, 0x2534, 0x2534 ), // Semi-graphic + 'SF010000' => array ( 0x250C, 0x250C, 0x250C, 0x250C ), + 'SF020000' => array ( 0x2514, 0x2514, 0x2514, 0x2514 ), + 'SF030000' => array ( 0x2510, 0x2510, 0x2510, 0x2510 ), + 'SF040000' => array ( 0x2518, 0x2518, 0x2518, 0x2518 ), + 'SF050000' => array ( 0x253C, 0x253C, 0x253C, 0x253C ), + 'SF060000' => array ( 0x252C, 0x252C, 0x252C, 0x252C ), + 'SF070000' => array ( 0x2534, 0x2534, 0x2534, 0x2534 ), + 'SF080000' => array ( 0x251C, 0x251C, 0x251C, 0x251C ), + 'SF090000' => array ( 0x2524, 0x2524, 0x2524, 0x2524 ), + 'SF100000' => array ( 0x2501, 0x2501, 0x2501, 0x2501 ), + 'SF110000' => array ( 0x2502, 0x2502, 0x2502, 0x2502 ), + 'SF190000' => array ( 0x2561, 0x2561, 0x2561, 0x2561 ), + 'SF200000' => array ( 0x2562, 0x2562, 0x2562, 0x2562 ), + 'SF210000' => array ( 0x2556, 0x2556, 0x2556, 0x2556 ), + 'SF220000' => array ( 0x2555, 0x2555, 0x2555, 0x2555 ), + 'SF230000' => array ( 0x2563, 0x2563, 0x2563, 0x2563 ), + 'SF240000' => array ( 0x2551, 0x2551, 0x2551, 0x2551 ), + 'SF250000' => array ( 0x2557, 0x2557, 0x2557, 0x2557 ), + 'SF260000' => array ( 0x255D, 0x255D, 0x255D, 0x255D ), + 'SF270000' => array ( 0x255C, 0x255C, 0x255C, 0x255C ), + 'SF280000' => array ( 0x255B, 0x255B, 0x255B, 0x255B ), + 'SF360000' => array ( 0x255E, 0x255E, 0x255E, 0x255E ), + 'SF370000' => array ( 0x255F, 0x255F, 0x255F, 0x255F ), + 'SF380000' => 
array ( 0x255F, 0x255F, 0x255F, 0x255F ), + 'SF390000' => array ( 0x2554, 0x2554, 0x2554, 0x2554 ), + 'SF400000' => array ( 0x2569, 0x2569, 0x2569, 0x2569 ), + 'SF410000' => array ( 0x2566, 0x2566, 0x2566, 0x2566 ), + 'SF420000' => array ( 0x2560, 0x2560, 0x2560, 0x2560 ), + 'SF430000' => array ( 0x2550, 0x2550, 0x2550, 0x2550 ), + 'SF440000' => array ( 0x256C, 0x256C, 0x256C, 0x256C ), + 'SF450000' => array ( 0x2567, 0x2567, 0x2567, 0x2567 ), + 'SF460000' => array ( 0x2568, 0x2568, 0x2568, 0x2568 ), + 'SF470000' => array ( 0x2564, 0x2564, 0x2564, 0x2564 ), + 'SF480000' => array ( 0x2565, 0x2565, 0x2565, 0x2565 ), + 'SF490000' => array ( 0x2559, 0x2559, 0x2559, 0x2559 ), + 'SF500000' => array ( 0x2558, 0x2558, 0x2558, 0x2558 ), + 'SF510000' => array ( 0x2552, 0x2552, 0x2552, 0x2552 ), + 'SF520000' => array ( 0x2553, 0x2553, 0x2553, 0x2553 ), + 'SF530000' => array ( 0x256B, 0x256B, 0x256B, 0x256B ), + 'SF540000' => array ( 0x256A, 0x256A, 0x256A, 0x256A ), + + // Wingdings + 'arrowboth' => array ( 0x2194, 0x2194, 0x2194, 0x2194 ), + 'arrowdown' => array ( 0x2193, 0x2193, 0x2193, 0x2193 ), + 'arrowleft' => array ( 0x2190, 0x2190, 0x2190, 0x2190 ), + 'arrowright' => array ( 0x2192, 0x2192, 0x2192, 0x2192 ), + 'arrowup' => array ( 0x2191, 0x2191, 0x2191, 0x2191 ), + 'arrowupdn' => array ( 0x2195, 0x2195, 0x2195, 0x2195 ), + 'arrowupdnbse' => array ( 0x21A8, 0x21A8, 0x21A8, 0x21A8 ), + 'barb2left' => array ( 0x1F868, 0x1F868, 0x1F868, 0x1F868 ), // Wide-headed leftwards barb arrow + 'barb2right' => array ( 0x1F86A, 0x1F86A, 0x1F86A, 0x1F86A ), // Wide-headed rightwards barb arrow + 'barb2up' => array ( 0x1F869, 0x1F869, 0x1F869, 0x1F869 ), // Wide-headed upwards barb arrow + 'barb2down' => array ( 0x1F86B, 0x1F86B, 0x1F86B, 0x1F86B ), // Wide-headed downwards barb arrow + 'barb2nw' => array ( 0x1F86C, 0x1F86C, 0x1F86C, 0x1F86C ), // Wide-headed north west barb arrow + 'barb2ne' => array ( 0x1F86D, 0x1F86D, 0x1F86D, 0x1F86D ), // Wide-headed north east barb arrow + 
'barb2sw' => array ( 0x1F86F, 0x1F86F, 0x1F86F, 0x1F86F ), // Wide-headed south west barb arrow + 'barb2se' => array ( 0x1F86E, 0x1F86E, 0x1F86E, 0x1F86E ), // Wide-headed south east barb arrow + 'barb4left' => array ( 0x1F878, 0x1F878, 0x1F878, 0x1F878 ), // Wide-headed leftwards barb arrow + 'barb4right' => array ( 0x1F87A, 0x1F87A, 0x1F87A, 0x1F87A ), // Wide-headed rightwards barb arrow + 'barb4up' => array ( 0x1F879, 0x1F879, 0x1F879, 0x1F879 ), // Wide-headed upwards barb arrow + 'barb4down' => array ( 0x1F87B, 0x1F87B, 0x1F87B, 0x1F87B ), // Wide-headed downwards barb arrow + 'barb4nw' => array ( 0x1F87C, 0x1F87C, 0x1F87C, 0x1F87C ), // Wide-headed north west barb arrow + 'barb4ne' => array ( 0x1F87D, 0x1F87D, 0x1F87D, 0x1F87D ), // Wide-headed north east barb arrow + 'barb4sw' => array ( 0x1F87F, 0x1F87F, 0x1F87F, 0x1F87F ), // Wide-headed south west barb arrow + 'barb4se' => array ( 0x1F87E, 0x1F87E, 0x1F87E, 0x1F87E ), // Wide-headed south east barb arrow + 'checkbld' => array ( 0x2714, 0x2714, 0x2714, 0x2714 ), // Heavy checkmark + 'diamond' => array ( 0x2666, 0x2666, 0x2666, 0x2666 ), + 'head2left' => array ( 0x2B98, 0x2B98, 0x2B98, 0x2B98 ), + 'head2right' => array ( 0x2B9A, 0x2B9A, 0x2B9A, 0x2B9A ), + 'head2up' => array ( 0x2B99, 0x2B99, 0x2B99, 0x2B99 ), + 'head2down' => array ( 0x2B9B, 0x2B9B, 0x2B9B, 0x2B9B ), + 'lozenge' => array ( 0x2B27, 0x2B27, 0x2B27, 0x2B27 ), + 'lozenge4' => array ( 0x2B27, 0x2B27, 0x2B27, 0x2B27 ), + 'lozenge6' => array ( 0x29EB, 0x29EB, 0x29EB, 0x29EB ), + 'openbullet' => array ( 0x25E6, 0x25E6, 0x25E6, 0x25E6 ), + 'square2' => array ( 0x25AA, 0x25AA, 0x25AA, 0x25AA ), + 'square4' => array ( 0x25AA, 0x25AA, 0x25AA, 0x25AA ), + 'square6' => array ( 0x25A0, 0x25A0, 0x25A0, 0x25A0 ), + 'xrhombus' => array ( 0x2756, 0x2756, 0x2756, 0x2756 ), + + // "Entities" found in some documents, but their name made it difficult to locate the entity reference + // within the PDF file ; their names are not meaningful enough to extrapolate 
their Unicode equivalent : + // .null + // [aAoO].superior + // allah + // apple + // arrowhorizex + // bari.dotless + // circumflex.arab + // cyrillic_otmark + // dot.one, dot.twohoriz, dot.threeup, dot.twovert, dot.four + // f02d + // Gxx, which do not seem to function as /gxx + // glyphxxx + // Ldot and ldot (didn't found the Unicode name) + // lillah + // noxxx, where 'xxx' is a Greek letter name + // nonmarkingreturn + // patah.wide + // pi1 + // ryial + // smallv + // UIforward + // vdaggerdbl + // wasla + // wavyhamza + // zero.slash + ) ; diff --git a/Maps/unicode-to-ansi.map b/Maps/unicode-to-ansi.map index 9a007d9..3d0a1c8 100755 --- a/Maps/unicode-to-ansi.map +++ b/Maps/unicode-to-ansi.map @@ -1,136 +1,136 @@ - '"', - 0x0085 => '...', - 0x0092 => '"', - 0x0094 => '"', - 0x0096 => '-', - // End Polish - - 0x00A0 => ' ', // Non-breakable space - 0x00AB => '"', // Left pointing double angle quotation mark - 0x00AD => '', // Break Opportunity After: generally provide a line break opportunity after the character - 0x00C6 => 'AE', // AE with ligature (Æ) - 0x00E6 => 'ae', // ae with ligature (æ) - 0x1680 => ' ', // OGHAM space mark - 0x0152 => 'OE', // OE with ligature (Œ) - 0x0153 => 'oe', // oe with ligature (œ) - 0x1D6B => 'ue', // ue with ligature - 0x2000 => ' ', // EN quad - 0x2001 => ' ', // EM quad - 0x2002 => ' ', // EN space - 0x2003 => ' ', // EM space - 0x2004 => ' ', // 3-per-EM space - 0x2005 => ' ', // 4-per-EM space - 0x2006 => ' ', // 6-per-EM space - 0x2007 => ' ', // Figure space - 0x2008 => ' ' , // Punctuation space - 0x2009 => ' ', // Thin s1pace - 0x200A => ' ', // Hair space - 0x200B => ' ', // Zero-width space - 0x200C => ' ', // Zero-width non-joiner - 0x200D => '', // Zero-width joiner - 0x2010 => '-', // Narrow hyphen - 0x2011 => '-', // Non-breaking hyphen - 0x2012 => '-', // Figure dash (has the same width as digits) - 0x2013 => '-', // EN dash (used to indicate range of values) - 0x2014 => ' - ', // EM dash (used to make a break 
in a flow of sentences) - 0x2015 => '- ', // Horizontal bar, used to introduce quoted text - 0x2018 => "'", // German right single quote - 0x2019 => "'", // Secondary level quotation - 0x201A => "'", // German left single quote - 0x201B => "'", // Reversed quote - 0x201C => '"', // Left double quotation mark - 0x201D => '"', // Double quote-apostrophe - 0x201E => '"', // Lower double quote-apostrophe - 0x2026 => '...', // Ellipsis - 0x2028 => "\n", // Line separator - 0x2029 => "\n", // Paragraph separator - 0x202F => ' ', // Narrow non-break space - 0x2039 => "'", // Single left pointing angle quotation mark - 0x203A => "'", // Single right pointing angle quotation mark - 0x2053 => '~', // Large tilde - 0x205F => ' ', // Medium mathematical space - 0x2060 => '', // Word joiner - 0x207B => '-', // Superscript minus - 0x208B => '-', // Subscript minus - 0x2160 => 'I', // Roman numeral : I - 0x2161 => 'II', // Roman numeral : II - 0x2162 => 'III', // Roman numeral : III - 0x2163 => 'IV', // Roman numeral : IV - 0x2164 => 'V', // Roman numeral : V - 0x2165 => 'VI', // Roman numeral : VI - 0x2166 => 'VII', // Roman numeral : VII - 0x2167 => 'VIII', // Roman numeral : VIII - 0x2168 => 'IX', // Roman numeral : IX - 0x2169 => 'X', // Roman numeral : X - 0x216A => 'XI', // Roman numeral : XI - 0x216B => 'XII', // Roman numeral : XII - 0x216C => 'L', // Roman numeral : L - 0x216D => 'C', // Roman numeral : C - 0x216E => 'D', // Roman numeral : D - 0x216F => 'M', // Roman numeral : M - 0x2170 => 'i', // Roman numeral : i - 0x2171 => 'ii', // Roman numeral : ii - 0x2172 => 'iii', // Roman numeral : iii - 0x2173 => 'iv', // Roman numeral : iv - 0x2174 => 'v', // Roman numeral : v - 0x2175 => 'vi', // Roman numeral : vi - 0x2176 => 'vii', // Roman numeral : vii - 0x2177 => 'viii', // Roman numeral : viii - 0x2178 => 'ix', // Roman numeral : ix - 0x2179 => 'x', // Roman numeral : x - 0x217A => 'xi', // Roman numeral : xi - 0x217B => 'xii', // Roman numeral : xii - 0x217C => 'l', 
// Roman numeral : l - 0x217D => 'c', // Roman numeral : c - 0x217E => 'd', // Roman numeral : d - 0x217F => 'm', // Roman numeral : m - 0x2212 => '-', // Minus sign (arithmetic operator) - 0x2758 => '|', // Light vertical bar - 0x2759 => '|', // Medium vertical bar - 0x2E3A => '-', // Two-EM dash - 0x2E3B => '-', // Three-EM dash - 0x3000 => ' ', // Ideographic space - 0x301D => '"', // Reversed double prime quotation mark - 0x301E => '"', // Double prime quotation map, - 0x301F => '"', // Low double prime quotation mark - 0xA728 => 'TZ', // TZ with ligature - 0xA729 => 'tz', // tz with ligature - 0xA732 => 'AA', // AA with ligature - 0xA733 => 'aa', // aa with ligature - 0xA734 => 'AO', // AO with ligature - 0xA735 => 'ao', // ao with ligature - 0xA736 => 'AU', // AU with ligature - 0xA737 => 'au', // au with ligature - 0xA738 => 'AV', // AV with ligature - 0xA739 => 'av', // av with ligature - 0xA73A => 'AV', // AV with ligature and bar - 0xA73B => 'av', // av with ligature and bar - 0xA73C => 'AY', // AY with ligature - 0xA73D => 'ay', // ay with ligature - 0xA74E => 'OO', // OO with ligature - 0xA74F => 'oo', // oo with ligature - 0xA760 => 'VY', // VY with ligature - 0xA761 => 'vy', // vy with ligature - 0xFB00 => 'ff', // ff with ligature - 0xFB01 => 'fi', // fi with ligature - 0xFB02 => 'fl', // fl with ligature - 0xFB03 => 'ffi', // ffi with ligature - 0xFB04 => 'ffl', // ffl with ligature - 0xFB05 => 'ft', // ft with ligature - 0xFB06 => 'st', // st with ligature - 0xFF08 => '(', - 0xFF09 => ')', - 0xFE31 => '|', // Vertical em dash - 0xFE32 => '|', // Vertical en dash - 0xFE58 => '-', // Small em dash - 0xFE63 => '-', // Small ASCII hyphen - 0xFF02 => '"', // Full width quotation mark - 0xFF07 => "'", // Full width apostrophe - 0xFF0D => '-', // Full-width hyphen variant of ascii hyphen - 0xFEFF => ' ', // Zero-width non-breaking space - ) ; + '"', + 0x0085 => '...', + 0x0092 => '"', + 0x0094 => '"', + 0x0096 => '-', + // End Polish + + 0x00A0 => ' ', // 
Non-breakable space + 0x00AB => '"', // Left pointing double angle quotation mark + 0x00AD => '', // Break Opportunity After: generally provide a line break opportunity after the character + 0x00C6 => 'AE', // AE with ligature (Æ) + 0x00E6 => 'ae', // ae with ligature (æ) + 0x1680 => ' ', // OGHAM space mark + 0x0152 => 'OE', // OE with ligature (Œ) + 0x0153 => 'oe', // oe with ligature (œ) + 0x1D6B => 'ue', // ue with ligature + 0x2000 => ' ', // EN quad + 0x2001 => ' ', // EM quad + 0x2002 => ' ', // EN space + 0x2003 => ' ', // EM space + 0x2004 => ' ', // 3-per-EM space + 0x2005 => ' ', // 4-per-EM space + 0x2006 => ' ', // 6-per-EM space + 0x2007 => ' ', // Figure space + 0x2008 => ' ' , // Punctuation space + 0x2009 => ' ', // Thin s1pace + 0x200A => ' ', // Hair space + 0x200B => ' ', // Zero-width space + 0x200C => ' ', // Zero-width non-joiner + 0x200D => '', // Zero-width joiner + 0x2010 => '-', // Narrow hyphen + 0x2011 => '-', // Non-breaking hyphen + 0x2012 => '-', // Figure dash (has the same width as digits) + 0x2013 => '-', // EN dash (used to indicate range of values) + 0x2014 => ' - ', // EM dash (used to make a break in a flow of sentences) + 0x2015 => '- ', // Horizontal bar, used to introduce quoted text + 0x2018 => "'", // German right single quote + 0x2019 => "'", // Secondary level quotation + 0x201A => "'", // German left single quote + 0x201B => "'", // Reversed quote + 0x201C => '"', // Left double quotation mark + 0x201D => '"', // Double quote-apostrophe + 0x201E => '"', // Lower double quote-apostrophe + 0x2026 => '...', // Ellipsis + 0x2028 => "\n", // Line separator + 0x2029 => "\n", // Paragraph separator + 0x202F => ' ', // Narrow non-break space + 0x2039 => "'", // Single left pointing angle quotation mark + 0x203A => "'", // Single right pointing angle quotation mark + 0x2053 => '~', // Large tilde + 0x205F => ' ', // Medium mathematical space + 0x2060 => '', // Word joiner + 0x207B => '-', // Superscript minus + 0x208B => '-', 
// Subscript minus + 0x2160 => 'I', // Roman numeral : I + 0x2161 => 'II', // Roman numeral : II + 0x2162 => 'III', // Roman numeral : III + 0x2163 => 'IV', // Roman numeral : IV + 0x2164 => 'V', // Roman numeral : V + 0x2165 => 'VI', // Roman numeral : VI + 0x2166 => 'VII', // Roman numeral : VII + 0x2167 => 'VIII', // Roman numeral : VIII + 0x2168 => 'IX', // Roman numeral : IX + 0x2169 => 'X', // Roman numeral : X + 0x216A => 'XI', // Roman numeral : XI + 0x216B => 'XII', // Roman numeral : XII + 0x216C => 'L', // Roman numeral : L + 0x216D => 'C', // Roman numeral : C + 0x216E => 'D', // Roman numeral : D + 0x216F => 'M', // Roman numeral : M + 0x2170 => 'i', // Roman numeral : i + 0x2171 => 'ii', // Roman numeral : ii + 0x2172 => 'iii', // Roman numeral : iii + 0x2173 => 'iv', // Roman numeral : iv + 0x2174 => 'v', // Roman numeral : v + 0x2175 => 'vi', // Roman numeral : vi + 0x2176 => 'vii', // Roman numeral : vii + 0x2177 => 'viii', // Roman numeral : viii + 0x2178 => 'ix', // Roman numeral : ix + 0x2179 => 'x', // Roman numeral : x + 0x217A => 'xi', // Roman numeral : xi + 0x217B => 'xii', // Roman numeral : xii + 0x217C => 'l', // Roman numeral : l + 0x217D => 'c', // Roman numeral : c + 0x217E => 'd', // Roman numeral : d + 0x217F => 'm', // Roman numeral : m + 0x2212 => '-', // Minus sign (arithmetic operator) + 0x2758 => '|', // Light vertical bar + 0x2759 => '|', // Medium vertical bar + 0x2E3A => '-', // Two-EM dash + 0x2E3B => '-', // Three-EM dash + 0x3000 => ' ', // Ideographic space + 0x301D => '"', // Reversed double prime quotation mark + 0x301E => '"', // Double prime quotation map, + 0x301F => '"', // Low double prime quotation mark + 0xA728 => 'TZ', // TZ with ligature + 0xA729 => 'tz', // tz with ligature + 0xA732 => 'AA', // AA with ligature + 0xA733 => 'aa', // aa with ligature + 0xA734 => 'AO', // AO with ligature + 0xA735 => 'ao', // ao with ligature + 0xA736 => 'AU', // AU with ligature + 0xA737 => 'au', // au with ligature + 0xA738 => 
'AV', // AV with ligature + 0xA739 => 'av', // av with ligature + 0xA73A => 'AV', // AV with ligature and bar + 0xA73B => 'av', // av with ligature and bar + 0xA73C => 'AY', // AY with ligature + 0xA73D => 'ay', // ay with ligature + 0xA74E => 'OO', // OO with ligature + 0xA74F => 'oo', // oo with ligature + 0xA760 => 'VY', // VY with ligature + 0xA761 => 'vy', // vy with ligature + 0xFB00 => 'ff', // ff with ligature + 0xFB01 => 'fi', // fi with ligature + 0xFB02 => 'fl', // fl with ligature + 0xFB03 => 'ffi', // ffi with ligature + 0xFB04 => 'ffl', // ffl with ligature + 0xFB05 => 'ft', // ft with ligature + 0xFB06 => 'st', // st with ligature + 0xFF08 => '(', + 0xFF09 => ')', + 0xFE31 => '|', // Vertical em dash + 0xFE32 => '|', // Vertical en dash + 0xFE58 => '-', // Small em dash + 0xFE63 => '-', // Small ASCII hyphen + 0xFF02 => '"', // Full width quotation mark + 0xFF07 => "'", // Full width apostrophe + 0xFF0D => '-', // Full-width hyphen variant of ascii hyphen + 0xFEFF => ' ', // Zero-width non-breaking space + ) ; diff --git a/NOTICE b/NOTICE index 725f769..881a6b8 100755 --- a/NOTICE +++ b/NOTICE @@ -1,14 +1,14 @@ -This class package is an extraction from a wider personal project called Thrak, -a sort of PHP framework for Web development and command-line scripting -(https://github.com/christian-vigh/php-thrak). - -I have made my best to extract this class and cut off all of its dependencies -from other Thrak classes without removing functionalities, so that it becomes a -standalone package that you can freely integrate into your projects. There is -no namespace, so that you are free to add your own one if you wish, and my own -error handling mechanism has been replaced with standard exceptions. -All other dependencies have been replaced with core PHP functions or, when -needed, by methods I developed elsewhere and imported into this package. 
- -Should you have any question or find a bug, please feel free to contact me at : -christian.vigh@orange.fr +This class package is an extraction from a wider personal project called Thrak, +a sort of PHP framework for Web development and command-line scripting +(https://github.com/christian-vigh/php-thrak). + +I have made my best to extract this class and cut off all of its dependencies +from other Thrak classes without removing functionalities, so that it becomes a +standalone package that you can freely integrate into your projects. There is +no namespace, so that you are free to add your own one if you wish, and my own +error handling mechanism has been replaced with standard exceptions. +All other dependencies have been replaced with core PHP functions or, when +needed, by methods I developed elsewhere and imported into this package. + +Should you have any question or find a bug, please feel free to contact me at : +christian.vigh@orange.fr diff --git a/PdfToText.phpclass b/PdfToText.phpclass index 1a69a24..858f28f 100755 --- a/PdfToText.phpclass +++ b/PdfToText.phpclass @@ -1,12996 +1,12996 @@ - Text ; // or : echo ( string ) $pdf ; - - Or : - - $pdf = new PdfToText ( ) ; - // Modify any property here before loading the file ; for example : - // $pdf -> BlockSeparator = " " ; - $pdf -> Load ( 'sample.pdf' ) ; - echo $pdf -> Text ; - - AUTHOR - Christian Vigh, 04/2016. - - HISTORY - [Version : 1.6.7] [Date : 2017/05/31] [Author : CV] - . Added CID fonts - . Changed the way CID font maps are searched and handled - - (...) - - [Version : 1.0] [Date : 2016/04/16] [Author : CV] - Initial version. - - **************************************************************************************************************/ - - -/*============================================================================================================== - - class PdfToTextException et al - - Implements an exception thrown when an error is encountered while decoding PDF files. 
- - ==============================================================================================================*/ - -// PdfToText exception - -// Base class for all other PdfToText exceptions. -class PdfToTextException extends Exception - { - public static $IsObject = false ; - } ; - - -// PdfToTextDecodingException - -// Thrown when unexpected data is encountered while analyzing PDF contents. -class PdfToTextDecodingException extends PdfToTextException - { - public function __construct ( $message, $object_id = false ) - { - $text = "Pdf decoding error" ; - - if ( $object_id !== false ) - $text .= " (object #$object_id)" ; - - $text .= " : $message" ; - - parent::__construct ( $text ) ; - } - } - - -// PdfToTextDecryptionException - -// Thrown when something unexpected is encountered while processing encrypted data. -class PdfToTextDecryptionException extends PdfToTextException - { - public function __construct ( $message, $object_id = false ) - { - $text = "Pdf decryption error" ; - - if ( $object_id !== false ) - $text .= " (object #$object_id)" ; - - $text .= " : $message" ; - - parent::__construct ( $text ) ; - } - } - - -// PdfToTextTimeoutException - -// Thrown when the PDFOPT_ENFORCE_EXECUTION_TIME or PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME option is set, and -// the script took longer than the allowed execution time limit. -class PdfToTextTimeoutException extends PdfToTextException - { - // Set to true if the reason why the max execution time was reached because of too many invocations of the Load() method - // Set to false if the max execution time was reached by simply processing one PDF file - public $GlobalTimeout ; - - public function __construct ( $message, $global, $php_setting, $class_setting ) - { - $text = "PdfToText max execution time reached " ; - - if ( ! 
$global ) - $text .= "for one single file " ; - - $text .= "(php limit = {$php_setting}s, class limit = {$class_setting}s) : $message" ; - - $this -> GlobalTimeout = $global ; - - parent::__construct ( $text ) ; - } - } - - -// PdfToTextFormException - -// Thrown if the xml template passed to the GetFormData() method contains an error. -class PdfToTextFormException extends PdfToTextException - { - public function __construct ( $message ) - { - $text = "Pdf form template error" ; - - $text .= " : $message" ; - - parent::__construct ( $text ) ; - } - } - - -// PdfToTextCaptureException - -// Thrown if the xml template passed to the SetCaptures() method contains an error. -class PdfToTextCaptureException extends PdfToTextException - { - public function __construct ( $message ) - { - $text = "Pdf capture template error" ; - - $text .= " : $message" ; - - parent::__construct ( $text ) ; - } - } - - - -/*============================================================================================================== - - Custom error reporting functions. - - ==============================================================================================================*/ -if ( ! function_exists ( 'warning' ) ) - { - function warning ( $message ) - { - trigger_error ( $message, E_USER_WARNING ) ; - } - } - - -if ( ! function_exists ( 'error' ) ) - { - function error ( $message ) - { - if ( is_string ( $message ) ) - trigger_error ( $message, E_USER_ERROR ) ; - else if ( is_a ( $message, '\Exception' ) ) - throw $message ; - } - } - - -/*============================================================================================================== - - Backward-compatibility issues. - - ==============================================================================================================*/ - -// hex2bin - -// This function appeared only in version 5.4.0 -if ( ! 
function_exists ( 'hex2bin' ) ) - { - function hex2bin ( $hexstring ) - { - $length = strlen ( $hexstring ) ; - $binstring = '' ; - $index = 0 ; - - while ( $index < $length ) - { - $byte = substr ( $hexstring, $index, 2 ) ; - $ch = pack ( 'H*', $byte ) ; - $binstring .= $ch ; - - $index += 2 ; - } - - return ( $binstring ) ; - } - - } - - -/*============================================================================================================== - - class PfObjectBase - - Base class for all PDF objects defined here. - - ==============================================================================================================*/ -abstract class PdfObjectBase // extends Object - { - // Possible encoding types for streams inside objects ; "unknown" means that the object contains no stream - const PDF_UNKNOWN_ENCODING = 0 ; // No stream decoding type could be identified - const PDF_ASCIIHEX_ENCODING = 1 ; // AsciiHex encoding - not tested - const PDF_ASCII85_ENCODING = 2 ; // Ascii85 encoding - not tested - const PDF_FLATE_ENCODING = 3 ; // Flate/deflate encoding - const PDF_TEXT_ENCODING = 4 ; // Stream data appears in clear text - no decoding required - const PDF_LZW_ENCODING = 5 ; // Not implemented yet - const PDF_RLE_ENCODING = 6 ; // Runtime length encoding ; not implemented yet - const PDF_DCT_ENCODING = 7 ; // JPEG images - const PDF_CCITT_FAX_ENCODING = 8 ; // CCITT Fax encoding - not implemented yet - const PDF_JBIG2_ENCODING = 9 ; // JBIG2 filter encoding (black/white) - not implemented yet - const PDF_JPX_ENCODING = 10 ; // JPEG2000 encoding - not implemented yet - - // Regular expression used for recognizing references to a font (this list is far from being exhaustive, as it seems - // that you can specify almost everything - however, trying to recognize everything would require to develop a complete - // parser) - protected static $FontSpecifiers = ' - (/F \d+ (\.\d+)? 
) | - (/R \d+) | - (/f-\d+-\d+) | - (/[CT]\d+_\d+) | - (/TT \d+) | - (/OPBaseFont \d+) | - (/OPSUFont \d+) | - (/[0-9a-zA-Z]) | - (/F\w+) | - (/[A-Za-z][A-Za-z0-9]* ( [\-+] [A-Za-z][A-Za-z0-9]* )) - ' ; - - // Maps alien Unicode characters such as special spaces, letters with ligatures to their ascii string equivalent - protected static $UnicodeToSimpleAscii = false ; - - - /*-------------------------------------------------------------------------------------------------------------- - - Constructor - - Performs static initializations such as the Unicode to Ascii table. - - *-------------------------------------------------------------------------------------------------------------*/ - public function __construct ( ) - { - if ( self::$UnicodeToSimpleAscii === false ) - { - $charset_file = dirname ( __FILE__ ) . "/Maps/unicode-to-ansi.map" ; - include ( $charset_file ) ; - self::$UnicodeToSimpleAscii = ( isset ( $unicode_to_ansi ) ) ? $unicode_to_ansi : array ( ) ; - } - - // parent::__construct ( ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - CodePointToUtf8 - Encodes a Unicode codepoint to UTF8. - - PROTOTYPE - $char = $this -> CodePointToUtf8 ( $code ) ; - - DESCRIPTION - Encodes a Unicode codepoint to UTF8, trying to handle all possible cases. - - PARAMETERS - $code (integer) - - Unicode code point to be translated. - - RETURN VALUE - A string that contains the UTF8 bytes representing the Unicode code point. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function CodePointToUtf8 ( $code ) - { - if ( $code ) - { - $result = '' ; - - while ( $code ) - { - $word = ( $code & 0xFFFF ) ; - - if ( ! isset ( self::$UnicodeToSimpleAscii [ $word ] ) ) - { - $entity = "&#$word;" ; - $result .= mb_convert_encoding ( $entity, 'UTF-8', 'HTML-ENTITIES' ) . 
$result ; - } - else - $result .= self::$UnicodeToSimpleAscii [ $word ] ; - - $code = ( integer ) ( $code / 0xFFFF ) ; // There is no unsigned right-shift operator in PHP... - } - - return ( $result ) ; - } - // No translation is apparently possible : use a placeholder to signal this situation - else - { - if ( strpos ( PdfToText::$Utf8Placeholder, '%' ) === false ) - { - return ( PdfToText::$Utf8Placeholder ) ; - } - else - return ( sprintf ( PdfToText::$Utf8Placeholder, $code ) ) ; - } - } - - - /*-------------------------------------------------------------------------------------------------------------- - - DecodeRawName - - Decodes a string that may contain constructs such as '#xy', where 'xy' are hex digits. - - *-------------------------------------------------------------------------------------------------------------*/ - public static function DecodeRawName ( $str ) - { - return ( rawurldecode ( str_replace ( '#', '%', $str ) ) ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - GetEncodingType - Gets an object encoding type. - - PROTOTYPE - $type = $this -> GetEncodingType ( $object_id, $object_data ) ; - - DESCRIPTION - When an object is a stream, returns its encoding type. - - PARAMETERS - $object_id (integer) - - PDF object number. - - $object_data (string) - - Object contents. - - RETURN VALUE - Returns one of the following values : - - - PdfToText::PDF_ASCIIHEX_ENCODING : - Hexadecimal encoding of the binary values. - Decoding algorithm was taken from the unknown contributor and not tested so far, since I - couldn't find a PDF file with such an encoding type. - - - PdfToText::PDF_ASCII85_ENCODING : - Obscure encoding format. - Decoding algorithm was taken from the unknown contributor and not tested so far, since I - couldn't find a PDF file with such an encoding type. - - - PdfToText::PDF_FLATE_ENCODING : - gzip/deflate encoding. 
- - - PdfToText::PDF_TEXT_ENCODING : - Stream data is unencoded (ie, it is pure ascii). - - - PdfToText::PDF_UNKNOWN_ENCODING : - The object data does not specify any encoding at all. It can happen on objects that do not have - a "stream" part. - - - PdfToText::PDF_DCT_ENCODING : - a lossy filter based on the JPEG standard. - - The following constants are defined but not yet implemented ; an exception will be thrown if they are - encountered somewhere in the PDF file : - - - PDF_LZW_ENCODING : - a filter based on LZW Compression; it can use one of two groups of predictor functions for more - compact LZW compression : Predictor 2 from the TIFF 6.0 specification and predictors (filters) - from the PNG specification - - - PDF_RLE_ENCODING : - a simple compression method for streams with repetitive data using the run-length encoding - algorithm and the image-specific filters. - - PDF_CCITT_FAX_ENCODING : - a lossless bi-level (black/white) filter based on the Group 3 or Group 4 CCITT (ITU-T) fax - compression standard defined in ITU-T T.4 and T.6. - - PDF_JBIG2_ENCODING : - a lossy or lossless bi-level (black/white) filter based on the JBIG2 standard, introduced in - PDF 1.4. - - PDF_JPX_ENCODING : - a lossy or lossless filter based on the JPEG 2000 standard, introduced in PDF 1.5. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function GetEncodingType ( $object_id, $object_data ) - { - $status = preg_match ( '# / (?P (ASCIIHexDecode) | (AHx) | (ASCII85Decode) | (A85) | (FlateDecode) | (Fl) | (DCTDecode) | (DCT) | ' . - '(LZWDecode) | (LZW) | (RunLengthDecode) | (RL) | (CCITTFaxDecode) | (CCF) | (JBIG2Decode) | (JPXDecode) ) \b #imsx', - $object_data, $match ) ; - - if ( ! 
$status ) - return ( self::PDF_TEXT_ENCODING ) ; - - switch ( strtolower ( $match [ 'encoding' ] ) ) - { - case 'asciihexdecode' : - case 'ahx' : return ( self::PDF_ASCIIHEX_ENCODING ) ; - - case 'ascii85decode' : - case 'a85' : return ( self::PDF_ASCII85_ENCODING ) ; - - case 'flatedecode' : - case 'fl' : return ( self::PDF_FLATE_ENCODING ) ; - - case 'dctdecode' : - case 'dct' : return ( self::PDF_DCT_ENCODING ) ; - - case 'lzwdecode' : - case 'lzw' : return ( self::PDF_LZW_ENCODING ) ; - - case 'ccittfaxdecode' : - case 'ccf' : - - case 'runlengthdecode' : - case 'rl' : - - case 'jbig2decode' : - - case 'jpxdecode' : - if ( PdfToText::$DEBUG > 1 ) - warning ( "Encoding type \"{$match [ 'encoding' ]}\" not yet implemented for pdf object #$object_id." ) ; - - default : return ( self::PDF_UNKNOWN_ENCODING ) ; - } - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - GetObjectReferences - Gets object references from a specified construct. - - PROTOTYPE - $status = $this -> GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids ) ; - - DESCRIPTION - Certain parameter specifications are followed by an object reference of the form : - x 0 R - but it can also be an array of references : - [x1 0 R x2 0 R ... xn 0 r] - Those kind of constructs can occur after parameters such as : /Pages, /Contents, /Kids... - This method extracts the object references found in such a construct. - - PARAMETERS - $object_id (integer) - - Id of the object to be analyzed. - - $object_data (string) - - Object contents. - - $searched_string (string) - - String to be searched, that must be followed by an object or an array of object references. - This parameter can contain constructs used in regular expressions. Note however that the '#' - character must be escaped, since it is used as a delimiter in the regex that is applied on - object data. 
- - $object_ids (array of integers) - - Returns on output the ids of the pdf object that have been found after the searched string. - - RETURN VALUE - True if the searched string has been found and is followed by an object or array of object references, - false otherwise. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids ) - { - $status = true ; - $object_ids = array ( ) ; - - if ( preg_match ( "#$searched_string \s* \\[ (?P [^\]]+ ) \\]#ix", $object_data, $match ) ) - { - $object_list = $match [ 'objects' ] ; - - if ( preg_match_all ( '/(?P \d+) \s+ \d+ \s+ R/x', $object_list, $matches ) ) - { - foreach ( $matches [ 'object' ] as $id ) - $object_ids [] = ( integer ) $id ; - } - else - $status = false ; - } - else if ( preg_match ( "#$searched_string \s+ (?P \d+) \s+ \d+ \s+ R#ix", $object_data, $match ) ) - { - $object_ids [] = ( integer ) $match [ 'object' ] ; - } - else - $status = false ; - - return ( $status ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - GetStringParameter - Retrieve a string flag value. - - PROTOTYPE - $result = $this -> GetStringParameter ( $parameter, $object_data ) ; - - DESCRIPTION - Retrieves the value of a string parameter ; for example : - - /U (parameter value) - - or : - - /U - - PARAMETERS - $parameter (string) - - Parameter name. - - $object_data (string) - - Object containing the parameter. - - RETURN VALUE - The parameter value. - - NOTES - description - - *-------------------------------------------------------------------------------------------------------------*/ - protected function GetStringParameter ( $parameter, $object_data ) - { - if ( preg_match ( '#' . $parameter . 
' \s* \( \s* (?P [^)]+) \)#ix', $object_data, $match ) ) - $result = $this -> ProcessEscapedString ( $match [ 'value' ] ) ; - else if ( preg_match ( '#' . $parameter . ' \s* \< \s* (?P [^>]+) \>#ix', $object_data, $match ) ) - { - $hexdigits = $match [ 'value' ] ; - $result = '' ; - - for ( $i = 0, $count = strlen ( $hexdigits ) ; $i < $count ; $i += 2 ) - $result .= chr ( hexdec ( substr ( $hexdigits, $i, 2 ) ) ) ; - } - else - $result = '' ; - - return ( $result ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - GetUTCDate - - Reformats an Adobe UTC date to a format that can be understood by the strtotime() function. - Dates are specified in the following format : - D:20150521154000Z - D:20160707182114+02 - with are both recognized by strtotime(). However, another format can be specified : - D:20160707182114+02'00' - which is not recognized by strtotime() so we have to get rid from the '00' part. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function GetUTCDate ( $date ) - { - if ( $date ) - { - if ( ( $date [0] == 'D' || $date [0] == 'd' ) && $date [1] == ':' ) - $date = substr ( $date, 2 ) ; - - if ( ( $index = strpos ( $date, "'" ) ) !== false ) - $date = substr ( $date, 0, $index ) ; - } - - return ( $date ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - IsCharacterMap - - Checks if the specified text contents represent a character map definition or not. 
- - *-------------------------------------------------------------------------------------------------------------*/ - protected function IsCharacterMap ( $decoded_data ) - { - // preg_match is faster than calling strpos several times - return ( preg_match ( '#(begincmap)|(beginbfrange)|(beginbfchar)|(/Differences)#ix', $decoded_data ) ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - IsFont - - Checks if the current object contents specify a font declaration. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function IsFont ( $object_data ) - { - return - ( - stripos ( $object_data, '/BaseFont' ) !== false || - ( ! preg_match ( '#/Type \s* /FontDescriptor#ix', $object_data ) && - preg_match ( '#/Type \s* /Font#ix', $object_data ) ) - ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - IsFormData - - Checks if the current object contents specify references to font data. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function IsFormData ( $object_data ) - { - return - ( - preg_match ( '#\bR \s* \( \s* datasets \s* \)#imsx', $object_data ) - ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - IsFontMap - - Checks if the code contains things like : - <> - which maps font 1 (when specified with the /Fx instruction) to object 26, 2 to object 22 and 3 to - object 18, respectively, in the above example. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function IsFontMap ( $object_data ) - { - $object_data = self::UnescapeHexCharacters ( $object_data ) ; - - if ( preg_match ( '#<< \s* ( ' . 
self::$FontSpecifiers . ' ) \s+ .* >>#imsx', $object_data ) ) - return ( true ) ; - else - return ( false ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - IsImage - - Checks if the code contains things like : - /Subtype/Image - - *-------------------------------------------------------------------------------------------------------------*/ - protected function IsImage ( $object_data ) - { - if ( preg_match ( '#/Subtype \s* /Image#msx', $object_data ) ) - return ( true ) ; - else - return ( false ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - IsObjectStream - - Checks if the code contains an object stream (/Type/ObjStm) - /Subtype/Image - - *-------------------------------------------------------------------------------------------------------------*/ - protected function IsObjectStream ( $object_data ) - { - if ( preg_match ( '#/Type \s* /ObjStm#isx', $object_data ) ) - return ( true ) ; - else - return ( false ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - IsPageHeaderOrFooter - Check if the specified object contents denote a text stream. - - PROTOTYPE - $status = $this -> IsPageHeaderOrFooter ( $stream_data ) ; - - DESCRIPTION - Checks if the specified decoded stream contents denotes header or footer data. - - PARAMETERS - $stream_data (string) - - Decoded stream contents. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function IsPageHeaderOrFooter ( $stream_data ) - { - if ( preg_match ( '#/Type \s* /Pagination \s* /Subtype \s*/((Header)|(Footer))#ix', $stream_data ) ) - return ( true ) ; - else if ( preg_match ( '#/Attached \s* \[ .*? 
/((Top)|(Bottom)) [^]]#ix', $stream_data ) ) - return ( true ) ; - else - return ( false ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - IsText - Check if the specified object contents denote a text stream. - - PROTOTYPE - $status = $this -> IsText ( $object_data, $decoded_stream_data ) ; - - DESCRIPTION - Checks if the specified object contents denote a text stream. - - PARAMETERS - $object_data (string) - - Object data, ie the contents located between the "obj" and "endobj" keywords. - - $decoded_stream_data (string) - - The flags specified in the object data are not sufficient to be sure that we have a block of - drawing instructions. We must also check for certain common instructions to be present. - - RETURN VALUE - True if the specified contents MAY be text contents, false otherwise. - - NOTES - I do not consider this method as bullet-proof. There may arise some cases where non-text blocks can be - mistakenly considered as text blocks, so it is subject to evolve in the future. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function IsText ( $object_data, $decoded_stream_data ) - { - if ( preg_match ( '# / (Filter) | (Length) #ix', $object_data ) && - ! 
preg_match ( '# / (Type) | (Subtype) | (Length1) #ix', $object_data ) ) - { - if ( preg_match ( '/\\b(BT|Tf|Td|TJ|Tj|Tm|Do|cm)\\b/', $decoded_stream_data ) ) - return ( true ) ; - } - else if ( preg_match ( '/\\b(BT|Tf|Td|TJ|Tj|Tm|Do|cm)\\b/', $decoded_stream_data ) ) - return ( true ) ; - - return ( false ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - PregStrReplace - Replace string(s) using regular expression(s) - - PROTOTYPE - $result = PdfToText::PregStrReplace ( $pattern, $replacement, $subject, $limit = -1, - &$match_count = null ) - - DESCRIPTION - This function behaves like a mix of str_replace() and preg_replace() ; it allows to search for strings - using regular expressions, but the replacements are plain-text strings and no reference to a capture - specified in the regular expression will be interpreted. - This is useful when processing templates, which can contain constructs such as "\00" or "$", which are - interpreted by preg_replace() as references to captures. - - The function has the same parameters as preg_replace(). - - RETURN VALUE - Returns the substituted text. - - *-------------------------------------------------------------------------------------------------------------*/ - public static function PregStrReplace ( $pattern, $replacement, $subject, $limit = -1, &$match_count = null ) - { - // Make sure that $pattern and $replacement become arrays of the same size - if ( is_array ( $pattern ) ) - { - if ( is_array ( $replacement ) ) - { - if ( count ( $pattern ) !== count ( $replacement ) ) - { - warning ( "The \$replacement parameter should have the same number of element as \$pattern." ) ; - return ( $subject ) ; - } - } - else - $replacement = array_fill ( $replacement, count ( $pattern ), $replacement ) ; - } - else - { - if ( is_array ( $replacement ) ) - { - warning ( "Expected string for the \$replacement parameter." 
) ; - return ( $subject ) ; - } - - $pattern = array ( $pattern ) ; - $replacement = array ( $replacement ) ; - } - - // Upper limit - if ( $limit < 1 ) - $limit = PHP_INT_MAX ; - - // Loop through each supplied pattern - $current_subject = $subject ; - $count = 0 ; - - for ( $i = 0, $pattern_count = count ( $pattern ) ; $i < $pattern_count ; $i ++ ) - { - $regex = $pattern [$i] ; - - // Get all matches for this pattern - if ( preg_match_all ( $regex, $current_subject, $matches, PREG_OFFSET_CAPTURE ) ) - { - $result = '' ; // Current output result - $last_offset = 0 ; - - // Process each match - foreach ( $matches [0] as $match ) - { - $offset = ( integer ) $match [1] ; - - // Append data from the last seen offset up to the current one - if ( $last_offset < $offset ) - $result .= substr ( $current_subject, $last_offset, $offset - $last_offset ) ; - - // Append the replacement string for this match - $result .= $replacement [$i] ; - - // Compute next offset in $current_subject - $last_offset = $offset + strlen ( $match [0] ) ; - - // Limit checking - $count ++ ; - - if ( $count > $limit ) - break 2 ; - } - - // Append the last part of the subject that has not been matched by anything - $result .= substr ( $current_subject, $last_offset ) ; - - // The current subject becomes the string that has been built in the steps above - $current_subject = $result ; - } - } - - /// All done, return - return ( $current_subject ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - ProcessEscapedCharacter - Interprets a character after a backslash in a string. - - PROTOTYPE - $ch = $this -> ProcessEscapedCharacter ( $ch ) ; - - DESCRIPTION - Interprets a character after a backslash in a string and returns the interpreted value. - - PARAMETERS - $ch (char) - - Character to be escaped. - - RETURN VALUE - The escaped character. - - NOTES - This method does not process octal sequences. 
- - *-------------------------------------------------------------------------------------------------------------*/ - protected function ProcessEscapedCharacter ( $ch ) - { - switch ( $ch ) - { - // Normally, only a few characters should be escaped... - case '(' : $newchar = "(" ; break ; - case ')' : $newchar = ")" ; break ; - case '[' : $newchar = "[" ; break ; - case ']' : $newchar = "]" ; break ; - case '\\' : $newchar = "\\" ; break ; - case 'n' : $newchar = "\n" ; break ; - case 'r' : $newchar = "\r" ; break ; - case 'f' : $newchar = "\f" ; break ; - case 't' : $newchar = "\t" ; break ; - case 'b' : $newchar = chr ( 8 ) ; break ; - case 'v' : $newchar = chr ( 11 ) ; break ; - - // ... but should we consider that it is a heresy to escape other characters ? - // For the moment, no. - default : $newchar = $ch ; break ; - } - - return ( $newchar ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - ProcessEscapedString - Processes a string which can have escaped characters. - - PROTOTYPE - $result = $this -> ProcessEscapedString ( $str, $process_octal_escapes = false ) ; - - DESCRIPTION - Processes a string which may contain escape sequences. - - PARAMETERS - $str (string) - - String to be processed. - - $process_octal_escapes (boolean) - - When true, octal escape sequences such as \037 are processed. - - RETURN VALUE - The processed input string. - - *-------------------------------------------------------------------------------------------------------------*/ - protected function ProcessEscapedString ( $str, $process_octal_escapes = false ) - { - $length = strlen ( $str ) ; - $offset = 0 ; - $result = '' ; - $ord0 = ord ( '0' ) ; - - while ( ( $backslash_index = strpos ( $str, '\\', $offset ) ) !== false ) - { - if ( $backslash_index + 1 < $length ) - { - $ch = $str [ ++ $backslash_index ] ; - - if ( ! 
$process_octal_escapes ) - { - $result .= substr ( $str, $offset, $backslash_index - $offset - 1 ) . $this -> ProcessEscapedCharacter ( $ch ) ; - $offset = $backslash_index + 1 ; - } - else if ( $ch < '0' || $ch > '7' ) - { - $result .= substr ( $str, $offset, $backslash_index - $offset - 1 ) . $this -> ProcessEscapedCharacter ( $ch ) ; - $offset = $backslash_index + 1 ; - } - else - { - $result .= substr ( $str, $offset, $backslash_index - $offset - 1 ) ; - $ord = ord ( $ch ) - $ord0 ; - $count = 0 ; - $backslash_index ++ ; - - while ( $backslash_index < $length && $count < 2 && - $str [ $backslash_index ] >= '0' && $str [ $backslash_index ] <= '7' ) - { - $ord = ( $ord * 8 ) + ( ord ( $str [ $backslash_index ++ ] ) - $ord0 ) ; - $count ++ ; - } - - $result .= chr ( $ord ) ; - $offset = $backslash_index ; - } - } - else - break ; - } - - $result .= substr ( $str, $offset ) ; - - return ( $result ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - Unescape - Processes escape sequences from the specified string. - - PROTOTYPE - $value = $this -> Unescape ( $text ) ; - - DESCRIPTION - Processes escape sequences within the specified text. The recognized escape sequences are like the - C-language ones : \b (backspace), \f (form feed), \r (carriage return), \n (newline), \t (tab). - All other characters prefixed by "\" are returned as is. - - PARAMETERS - $text (string) - - Text to be unescaped. - - RETURN VALUE - Returns the unescaped value of $text. 
- - *-------------------------------------------------------------------------------------------------------------*/ - public static function Unescape ( $text ) - { - $length = strlen ( $text ) ; - $result = '' ; - $ord0 = ord ( 0 ) ; - - for ( $i = 0 ; $i < $length ; $i ++ ) - { - $ch = $text [$i] ; - - if ( $ch == '\\' && isset ( $text [$i+1] ) ) - { - $nch = $text [++$i] ; - - switch ( $nch ) - { - case 'b' : $result .= "\b" ; break ; - case 't' : $result .= "\t" ; break ; - case 'f' : $result .= "\f" ; break ; - case 'r' : $result .= "\r" ; break ; - case 'n' : $result .= "\n" ; break ; - default : - // Octal escape notation - if ( $nch >= '0' && $nch <= '7' ) - { - $ord = ord ( $nch ) - $ord0 ; - $digits = 1 ; - $i ++ ; - - while ( $i < $length && $digits < 3 && $text [$i] >= '0' && $text [$i] <= '7' ) - { - $ord = ( $ord * 8 ) + ord ( $text [$i] ) - $ord0 ; - $i ++ ; - $digits ++ ; - } - - $i -- ; // Count one character less since $i will be incremented at the end of the for() loop - - $result .= chr ( $ord ) ; - } - else - $result .= $nch ; - } - } - else - $result .= $ch ; - } - - return ( $result ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - UnescapeHexCharacters - Unescapes characters in the #xy notation. - - PROTOTYPE - $result = $this -> UnescapeHexCharacters ( $data ) ; - - DESCRIPTION - Some specifications contain hex characters specified as #xy. For the moment, I have met such a construct in - font aliases such as : - /C2#5F0 25 0 R - where "#5F" stands for "_", giving : - /C2_0 25 0 R - Hope that such constructs do not happen in other places... - - PARAMETERS - $data (string) - - String to be unescaped. - - RETURN VALUE - The input string with all the hex character representations replaced with their ascii equivalent. 
- - *-------------------------------------------------------------------------------------------------------------*/ - public static function UnescapeHexCharacters ( $data ) - { - if ( strpos ( $data, 'stream' ) === false && preg_match ( '/(?P \# [0-9a-f] [0-9a-f])/ix', $data ) ) - { - preg_match_all ( '/(?P \# [0-9a-f] [0-9a-f])/ix', $data, $matches ) ; - - $searches = array ( ) ; - $replacements = array ( ) ; - - foreach ( $matches [ 'hex' ] as $hex ) - { - if ( ! isset ( $searches [ $hex ] ) ) - { - $searches [ $hex ] = $hex ; - $replacements [] = chr ( hexdec ( substr ( $hex, 1 ) ) ) ; - } - - $data = str_replace ( $searches, $replacements, $data ) ; - } - } - - return ( $data ) ; - } - - - /*-------------------------------------------------------------------------------------------------------------- - - ValidatePhpName - - Checks that the specified name (declared in the XML template) is a valid PHP name. - - *-------------------------------------------------------------------------------------------------------------*/ - public static function ValidatePhpName ( $name ) - { - $name = trim ( $name ) ; - - if ( ! preg_match ( '/^ [a-z_][a-z0-9_]* $/ix', $name ) ) - error ( new PdfToTextFormException ( "Invalid PHP name \"$name\"." ) ) ; - - return ( $name ) ; - } - } - - -/*============================================================================================================== - - PdfToText class - - A class for extracting text from Pdf files. 
- - ==============================================================================================================*/ -class PdfToText extends PdfObjectBase - { - // Current version of the class - const VERSION = "1.6.7" ; - - // Pdf processing options - const PDFOPT_NONE = 0x00000000 ; // No extra option - const PDFOPT_REPEAT_SEPARATOR = 0x00000001 ; // Repeats the Separator property if the offset between two text blocks (in array notation) - // is greater than $this -> MinSpaceWidth - const PDFOPT_GET_IMAGE_DATA = 0x00000002 ; // Retrieve raw image data in the $ths -> ImageData array - const PDFOPT_DECODE_IMAGE_DATA = 0x00000004 ; // Creates a jpeg resource for each image - const PDFOPT_IGNORE_TEXT_LEADING = 0x00000008 ; // Ignore text leading values - const PDFOPT_NO_HYPHENATED_WORDS = 0x00000010 ; // Join hyphenated words that are split on two lines - const PDFOPT_AUTOSAVE_IMAGES = 0x00000020 ; // Autosave images ; the ImageFileTemplate property will need to be defined - const PDFOPT_ENFORCE_EXECUTION_TIME = 0x00000040 ; // Enforces the max_execution_time PHP setting when processing a file. A PdfTexterTimeoutException - // will be thrown if processing of a single file reaches (time_limit - 1 second) by default - // The MaxExecutionTime property can be set to modify this default value. 
- const PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME = 0x00000080 ; // Same as PDFOPT_ENFORCE_EXECUTION_TIME, but for all calls to the Load() method of the PdfToText class - // The MaxGlobalExecutionTime static property can be set to modify the default time limit - const PDFOPT_IGNORE_HEADERS_AND_FOOTERS = 0x00000300 ; // Ignore headers and footers - - const PDFOPT_RAW_LAYOUT = 0x00000000 ; // Layout rendering : raw (default) - const PDFOPT_BASIC_LAYOUT = 0x00000400 ; // Layout rendering : basic - - const PDFOPT_LAYOUT_MASK = 0x00000C00 ; // Mask to isolate the targeted layout - - const PDFOPT_ENHANCED_STATISTICS = 0x00001000 ; // Compute statistics on PDF language instructions - const PDFOPT_DEBUG_SHOW_COORDINATES = 0x00002000 ; // Include text coordinates ; implies the PDFOPT_BASIC_LAYOUT option - // This option can be useful if you want to use capture areas and get information about - // their coordinates - const PDFOPT_CAPTURE = 0x00004000 ; // Indicates that the caller wants to capture some text and use the SetCaptures() method - // It currently enables the PDFOPT_BASIC_LAYOUT option - const PDFOPT_LOOSE_X_CAPTURE = 0x00008000 ; // Includes in captures text fragments whose dimensions may exceed the captured area dimensions - const PDFOPT_LOOSE_Y_CAPTURE = 0x00010000 ; // (currently not used) - - // When boolean true, outputs debug information about fonts, character maps and drawing contents. - // When integer > 1, outputs additional information about other objects. 
- public static $DEBUG = false ; - - // Current filename - public $Filename = false ; - // Extracted text - public $Text = '' ; - // Document pages (array of strings) - public $Pages = array ( ) ; - // Document images (array of PdfImage objects) - public $Images = array ( ) ; - protected $ImageCount = 0 ; - // Raw data for document images - public $ImageData = array ( ) ; - // ImageAutoSaveFileTemplate : - // Template for the file names to be generated when extracting images, if the PDFOPT_AUTOSAVE_IMAGES has been specified. - // Can contain any path, plus the following printf()-like modifiers : - // . "%p" : Path of the original PDF file. - // . "%f" : Filename part of the original PDF file. - // . "%d" : A sequential number, starting from 1, used when generating filenames. The format can contains a width specifier, - // such as "%3d", which will generate 3-digits sequential numbers left-filled with zeroes. - // . "%s" : Image suffix, which will automatically based on the underlying image type. - public $ImageAutoSaveFileTemplate = "%p/%f.%d.%s" ; - // Auto-save image file format - public $ImageAutoSaveFormat = IMG_JPEG ; - // Auto-saved image file names - public $AutoSavedImageFiles = array ( ) ; - // Text chunk separator (used to separate blocks of text specified as an array notation) - public $BlockSeparator = '' ; - // Separator used to separate text groups where the offset value is less than -1000 thousands of character units - // (eg : [(1)-1822(2)] will add a separator between the characters "1" and "2") - // Note that such values are expressed in thousands of text units and subtracted from the current position. A - // negative value means adding more space between the two text units it separates. 
- public $Separator = ' ' ; - // Separator to be used between pages in the $Text property - public $PageSeparator = "\n" ; - // Minimum value (in 1/1000 of text units) that separates two text chunks that can be considered as a real space - public $MinSpaceWidth = 200 ; - // Pdf options - public $Options = self::PDFOPT_NONE ; - // Maximum number of pages to extract from the PDF. A zero value means "extract everything" - // If this number is negative, then the pages to be extract start from the last page. For example, a value of -2 - // extracts the last two pages - public $MaxSelectedPages = false ; - // Maximum number of images to be extracted. A value of zero means "extract everything". A non-zero value gives - // the number of images to extract. - public $MaxExtractedImages = false ; - // Location of the CID tables directory - public static $CIDTablesDirectory ; - // Loacation of the Font metrics directory, for the Adobe standard 14 fonts - public static $FontMetricsDirectory ; - // Standard Adobe font names, and their corresponding file in $FontMetricsDirectory - public static $AdobeStandardFontMetrics = array - ( - 'courier' => 'courier.fm', - 'courier-bold' => 'courierb.fm', - 'courier-oblique' => 'courieri.fm', - 'courier-boldoblique' => 'courierbi.fm', - 'helvetica' => 'helvetica.fm', - 'helvetica-bold' => 'helveticab.fm', - 'helvetica-oblique' => 'helveticai.fm', - 'helvetica-boldoblique' => 'helveticabi.fm', - 'symbol' => 'symbol.fm', - 'times-roman' => 'times.fm', - 'times-bold' => 'timesb.fm', - 'times-bolditalic' => 'timesbi.fm', - 'times-italic' => 'timesi.fm', - 'zapfdingbats' => 'zapfdingbats.fm' - ) ; - // Author information - public $Author = '' ; - public $CreatorApplication = '' ; - public $ProducerApplication = '' ; - public $CreationDate = '' ; - public $ModificationDate = '' ; - public $Title = '' ; - public $Subject = '' ; - public $Keywords = '' ; - protected $GotAuthorInformation = false ; - // Unique and arbitrary file identifier, as 
specified in the PDF file - // Well, in fact, there are two IDs, but the PDF specification does not mention the goal of the second one - public $ID = '' ; - public $ID2 = '' ; - // End of line string - public $EOL = PHP_EOL ; - // String to be used when no Unicode translation is possible - public static $Utf8Placeholder = '' ; - // Information about memory consumption implied by the file currently being loaded - public $MemoryUsage, - $MemoryPeakUsage ; - // Offset of the document start (%PDF-x.y) - public $DocumentStartOffset ; - // Debug statistics - public $Statistics = array ( ) ; - // Max execution time settings. A positive value means "don't exceed that number of seconds". - // A negative value means "Don't exceed PHP setting max_execution_time - that number of seconds". If the result - // is negative, then the default will be "max_execution_time - 1". - // For those limits to be enforced, you need to specify either the PDFOPT_ENFORCE_EXECUTION_TIME or - // PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME options, or both - public $MaxExecutionTime = -1 ; - public static $MaxGlobalExecutionTime = -1 ; - // This property is expressed in percents ; it gives the extra percentage to add to the values computed by - // the PdfTexterFont::GetStringWidth() method. - // This is basically used when computing text positions and string lengths with the PDFOPT_BASIC_LAYOUT option : - // the computed string length is shorter than its actual length (because of extra spacing determined by character - // kerning in the font data). To determine whether two consecutive blocks of text should be separated by a space, - // we empirically add this extra percentage to the computed string length. The default is -5%. - public $ExtraTextWidth = -5 ; - - // Marker stuff. 
The unprocessed marker list is a sequential array of markers, which will later be dispatched into - // indexed arrays during their first reference - protected $UnprocessedMarkerList = array ( 'font' => array ( ) ) ; - protected $TextWithFontMarkers = array ( ) ; - - // Internal variables used when the PDFOPT_ENFORCE_* options are specified - protected static $PhpMaxExecutionTime ; - protected static $GlobalExecutionStartTime ; - protected static $AllowedGlobalExecutionTime ; - protected $ExecutionStartTime ; - protected $AllowedExecutionTime ; - - // Font mappings - protected $FontTable = false ; - // Extra Adobe standard font mappings (for character names of the form "/axxx" for example) - protected $AdobeExtraMappings = array ( ) ; - // Page map object - protected $PageMap ; - // Page locations (start and end offsets) - protected $PageLocations ; - // Encryption data - public $IsEncrypted = false ; - protected $EncryptionData = false ; - // A flag coming from the constructor options, telling if enhanced statistics are enabled - protected $EnhancedStatistics ; - - // Document text fragments, with their absolute (x,y) position, approximate width and height - protected $DocumentFragments ; - - // Form data - protected $FormData ; - protected $FormDataObjectNumbers ; - protected $FormDataDefinitions ; - protected $FormaDataObjects ; - - // Capture data - public $CaptureDefinitions ; - protected $CaptureObject ; - - // Indicates whether global static initializations have been made - // This is mainly used for variables such as $Utf8PlaceHolder, which is initialized to a different value - private static $StaticInitialized = false ; - - // Drawing instructions that are to be ignored and removed from a text stream before processing, for performance - // reasons (it is faster to call preg_replace() once to remove them than calling the __next_instruction() and - // __next_token() methods to process an input stream containing such useless instructions) - // This is an array 
of regular expressions where the following constructs are replaced at runtime during static - // initialization : - // %n - Will be replaced with a regex matching a decimal number. - private static $IgnoredInstructionTemplatesLayout = array - ( - '%n{6} ( (c) ) \s+', - '%n{4} ( (re) | (y) | (v) | (k) | (K) ) \s+', - '%n{3} ( (scn) | (SCN) | (r) | (rg) | (RG) | (sc) | (SC) ) \s+', - '%n{2} ( (m) | (l) ) \s+', - '%n ( (w) | (M) | (g) | (G) | (J) | (j) | (d) | (i) | (sc) | (SC) | (Tc) | (Tw) | (scn) | (Tr) | (Tz) | (Ts) ) \s+', - '\b ( (BDC) | (EMC) ) \s+', - '\/( (Cs \d+) | (CS \d+) | (G[Ss] \d+) | (Fm \d+) | (Im \d+) | (PlacedGraphic) ) \s+ \w+ \s*', - '\/( (Span) | (Artifact) | (Figure) | (P) ) \s* << .*? >> [ \t\r\n>]*', - '\/ ( (PlacedGraphic) | (Artifact) ) \s+', - '\d+ \s+ ( (scn) | (SCN) )', - '\/MC \d+ \s+', - '^ \s* [fhS] \r? \n', - '^W \s+ n \r? \n', - '(f | W) \* \s+', - '^[fhnS] \s+', - '-?0 (\. \d+)? \s+ T[cw]', - '\bBI \s+ .*? \bID \s+ .*? \bEI', - '\/ \w+ \s+ ( (cs) | (CS) | (ri) | (gs) )', - // Hazardous replaces ? - '( [Ww] \s+ ){3,}', - ' \[\] \s+ [Shs] \s+' - ) ; - // Additional instructions to be stripped when no particular page layout has been requested - private static $IgnoredInstructionTemplatesNoLayout = array - ( - '%n{6} ( (cm) ) \s+', -// '\b ( (BT) | (ET) ) \s+', - '^ \s* [Qq] \r? \n', - '^ \s* (\b [a-zA-Z] \s+)+', - '\s* (\b [a-zA-Z] \s+)+$', - '^[qQ] \s+', - '^q \s+ [hfS] \n', - '( [Qfhnq] \s+ ){2,}' - ) ; - // Replacement regular expressions for %something constructs specified in the $IgnoredInstructions array - private static $ReplacementConstructs = array - ( - '%n' => '( [+\-]? ( ( [0-9]+ ( \. [0-9]* )? ) | ( \. 
[0-9]+ ) ) \s+ )' - ) ; - // The final regexes that are built during static initialization by the __build_ignored_instructions() method - private static $IgnoredInstructionsNoLayout = array ( ) ; - private static $IgnoredInstructionsLayout = array ( ) ; - private $IgnoredInstructions = array ( ) ; - - // Map id buffer - for avoiding unneccesary calls to GetFontByMapId - private $MapIdBuffer = array ( ) ; - - // Same for MapCharacter() - private $CharacterMapBuffer = array ( ) ; - - // Font objects buffer - used by __assemble_text_fragments() - private $FontObjectsBuffer = array ( ) ; - - // Regex used for removing hyphens - we have to take care of different line endings : "\n" for Unix, "\r\n" - // for Windows, and "\r" for pure Mac files. - // Note that we replace an hyphen followed by an end-of-line then by non-space characters with the non-space - // characters, so the word gets joined on the same line. Spaces after the end of the word (on the next line) - // are removed, in order for the next word to appear at the beginning of the second line. - private static $RemoveHyphensRegex = '# - ( - - - [ \t]* ( (\r\n) | \n | \r )+ [ \t\r\n]* - ) - ([^ \t\r\n]+) - \s* - #msx' ; - - // A small list of Unicode character ranges that are related to languages written from right to left - // For performance reasons, everythings is mapped to a range here, even if it includes codepoints that do not map to anything - // (this class is not a Unicode codepoint validator, but a Pdf text extractor...) - // The UTF-16 version is given as comments ; only the UTF-8 translation is used here - // To be completed ! 
- private static $RtlCharacters = array - ( - // This range represents the following languages : - // - Hebrew (0590..05FF) - // - Arabic (0600..06FF) - // - Syriac (0700..074F) - // - Supplement for Arabic (0750..077F) - // - Thaana (0780..07BF) - // - N'ko (07C0..07FF) - // - Samaritan (0800..083F) - // - Mandaic (0840..085F) - // array ( 0x00590, 0x0085F ), - // Hebrew supplement (I suppose ?) + other characters - // array ( 0x0FB1D, 0x0FEFC ), - // Mende kikakui - // array ( 0x1E800, 0x1E8DF ), - // Adlam - // array ( 0x1E900, 0x1E95F ), - // Others - // array ( 0x10800, 0x10C48 ), - // array ( 0x1EE00, 0x1EEBB ) - "\xD6" => array ( array ( "\x90", "\xBF" ) ), - "\xD7" => array ( array ( "\x80", "\xBF" ) ), - "\xD8" => array ( array ( "\x80", "\xBF" ) ), - "\xD9" => array ( array ( "\x80", "\xBF" ) ), - "\xDA" => array ( array ( "\x80", "\xBF" ) ), - "\xDB" => array ( array ( "\x80", "\xBF" ) ), - "\xDC" => array ( array ( "\x80", "\xBF" ) ), - "\xDD" => array ( array ( "\x80", "\xBF" ) ), - "\xDE" => array ( array ( "\x80", "\xBF" ) ), - "\xDF" => array ( array ( "\x80", "\xBF" ) ) - /* - "\xE0" => array - ( - array ( "\xA0\x80", "\xA0\xBF" ), - array ( "\xA1\x80", "\xA1\x9F" ) - ), - "\xEF" => array - ( - array ( "\xAC\x9D", "\xAC\xBF" ), - array ( "\xAD\x80", "\xAD\xBF" ), - array ( "\xAE\x80", "\xAE\xBF" ), - array ( "\xAF\x80", "\xAF\xBF" ), - array ( "\xB0\x80", "\xB0\xBF" ), - array ( "\xB1\x80", "\xB1\xBF" ), - array ( "\xB2\x80", "\xB2\xBF" ), - array ( "\xB3\x80", "\xB3\xBF" ), - array ( "\xB4\x80", "\xB4\xBF" ), - array ( "\xB5\x80", "\xB5\xBF" ), - array ( "\xB6\x80", "\xB6\xBF" ), - array ( "\xB7\x80", "\xB7\xBF" ), - array ( "\xB8\x80", "\xB8\xBF" ), - array ( "\xB9\x80", "\xB9\xBF" ), - array ( "\xBA\x80", "\xBA\xBF" ), - array ( "\xBB\x80", "\xBB\xBC" ) - ) - */ - ) ; - - // UTF-8 prefixes for RTL characters as keys, and number of characters that must follow the prefix as values - private static $RtlCharacterPrefixLengths = array - ( - "\xD6" => 
1, - "\xD7" => 1, - "\xD8" => 1, - "\xD9" => 1, - "\xDA" => 1, - "\xDB" => 1, - "\xDC" => 1, - "\xDE" => 1, - "\xDF" => 1 - /* - "\xE0" => 2, - "\xEF" => 2 - */ - ) ; - - // A string that contains all the RTL character prefixes above - private static $RtlCharacterPrefixes ; - - // As usual, caching a little bit the results of the IsRtlCharacter() method is welcome. Each item will have the value true if the - // character is RTL, or false if LTR. - private $RtlCharacterBuffer = array ( ) ; - - // A subset of a character classification array that avoids too many calls to the ctype_* functions or too many - // character comparisons. - // This array is used only for highly sollicited parts of code - const CTYPE_ALPHA = 0x01 ; // Letter - const CTYPE_DIGIT = 0x02 ; // Digit - const CTYPE_XDIGIT = 0x04 ; // Hex digit - const CTYPE_ALNUM = 0x08 ; // Letter or digit - const CTYPE_LOWER = 0x10 ; // Lower- or upper-case letters - const CTYPE_UPPER = 0x20 ; - - private static $CharacterClasses = false ; - - // Stuff specific to the current PHP version - private static $HasMemoryGetUsage ; - private static $HasMemoryGetPeakUsage ; - - - /*-------------------------------------------------------------------------------------------------------------- - - CONSTRUCTOR - $pdf = new PdfToText ( $filename = null, $options = PDFOPT_NONE ) ; - - DESCRIPTION - Builds a PdfToText object and optionally loads the specified file's contents. - - PARAMETERS - $filename (string) - - Optional PDF filename whose text contents are to be extracted. - - $options (integer) - - A combination of PDFOPT_* flags. This can be any of the following : - - - PDFOPT_REPEAT_SEPARATOR : - Text constructs specified as an array are separated by an offset which is expressed as - thousands of text units ; for example : - - [(1)-2000(2)] - - will be rendered as the text "1 2" ("1" and "2" being separated by two spaces) if the - "Separator" property is set to a space (the default) and this flag is specified. 
- When not specified, the text will be rendered as "1 2". - - - PDFOPT_NONE : - None of the above options will apply. - - *-------------------------------------------------------------------------------------------------------------*/ - public function __construct ( $filename = null, $options = self::PDFOPT_NONE, $user_password = false, $owner_password = false ) - { - // We need the mbstring PHP extension here... - if ( ! function_exists ( 'mb_convert_encoding' ) ) - error ( "You must enable the mbstring PHP extension to use this class." ) ; - - // Perform static initializations if needed - if ( ! self::$StaticInitialized ) - { - if ( self::$DEBUG ) - { - // In debug mode, initialize the utf8 placeholder only if it still set to its default value, the empty string - if ( self::$Utf8Placeholder == '' ) - self::$Utf8Placeholder = '[Unknown character : 0x%08X]' ; - } - - // Build the list of regular expressions from the list of ignored instruction templates - self::__build_ignored_instructions ( ) ; - - // Check if some functions are supported or not - self::$HasMemoryGetUsage = function_exists ( 'memory_get_usage' ) ; - self::$HasMemoryGetPeakUsage = function_exists ( 'memory_get_peak_usage' ) ; - - // Location of the directory containing CID fonts - self::$CIDTablesDirectory = dirname ( __FILE__ ) . DIRECTORY_SEPARATOR . 'CIDTables' ; - self::$FontMetricsDirectory = dirname ( __FILE__ ) . DIRECTORY_SEPARATOR . 
'FontMetrics' ; - - // The string that contains all the Rtl character prefixes in UTF-8 - An optimization used by the __rtl_process() method - self::$RtlCharacterPrefixes = implode ( '', array_keys ( self::$RtlCharacterPrefixLengths ) ) ; - - // Build the character classes (used only for testing letters and digits) - if ( self::$CharacterClasses === false ) - { - for ( $ord = 0 ; $ord < 256 ; $ord ++ ) - { - $ch = chr ( $ord ) ; - - if ( $ch >= '0' && $ch <= '9' ) - self::$CharacterClasses [ $ch ] = self::CTYPE_DIGIT | self::CTYPE_XDIGIT | self::CTYPE_ALNUM ; - else if ( $ch >= 'A' && $ch <= 'Z' ) - { - self::$CharacterClasses [ $ch ] = self::CTYPE_ALPHA | self::CTYPE_UPPER | self::CTYPE_ALNUM ; - - if ( $ch <= 'F' ) - self::$CharacterClasses [ $ch ] |= self::CTYPE_XDIGIT ; - } - else if ( $ch >= 'a' && $ch <= 'z' ) - { - self::$CharacterClasses [ $ch ] = self::CTYPE_ALPHA | self::CTYPE_LOWER | self::CTYPE_ALNUM ; - - if ( $ch <= 'f' ) - self::$CharacterClasses [ $ch ] |= self::CTYPE_XDIGIT ; - } - else - self::$CharacterClasses [ $ch ] = 0 ; - } - } - - // Global execution time limit - self::$PhpMaxExecutionTime = ( integer ) ini_get ( 'max_execution_time' ) ; - - if ( ! 
self::$PhpMaxExecutionTime ) // Paranoia : default max script execution time to 120 seconds - self::$PhpMaxExecutionTime = 120 ; - - self::$GlobalExecutionStartTime = microtime ( true ) ; // Set the start of the first execution - - if ( self::$MaxGlobalExecutionTime > 0 ) - self::$AllowedGlobalExecutionTime = self::$MaxGlobalExecutionTime ; - else - self::$AllowedGlobalExecutionTime = self::$PhpMaxExecutionTime + self::$MaxGlobalExecutionTime ; - - // Adjust in case of inconsistent values - if ( self::$AllowedGlobalExecutionTime < 0 || self::$AllowedGlobalExecutionTime > self::$PhpMaxExecutionTime ) - self::$AllowedGlobalExecutionTime = self::$PhpMaxExecutionTime - 1 ; - - self::$StaticInitialized = true ; - } - - parent::__construct ( ) ; - - $this -> Options = $options ; - - if ( $filename ) - $this -> Load ( $filename, $user_password, $owner_password ) ; - } - - - public function __tostring ( ) - { return ( $this -> Text ) ; }oad - Loads text contents from a PDF file. - LoadFromString - Loads PDF contents from a string. - - PROTOTYPE - $text = $pdf -> Load ( $filename, $user_password = false, $owner_password = false ) ; - $text = $pdf -> LoadFromString ( $contents, $user_password = false, $owner_password = false ) ; - - DESCRIPTION - The Load() method extracts text contents from the specified PDF file. Once processed, text contents will - be available through the "Text" property. - The LoadFromString() method performs the same operation on PDF contents already loaded into memory. - - PARAMETERS - $filename (string) - - Optional PDF filename whose text contents are to be extracted. - - $contents (string) - - String containing PDF contents. - - $user_password (string) - - User password used for decrypting PDF contents. - - $owner_password (string) - - Owner password. 
- - *-------------------------------------------------------------------------------------------------------------*/ - private $__memory_peak_usage_start, - $__memory_usage_start ; - - public function Load ( $filename, $user_password = false, $owner_password = false ) - { - $this -> __memory_usage_start = ( self::$HasMemoryGetUsage ) ? memory_get_usage ( true ) : 0 ; - $this -> __memory_peak_usage_start = ( self::$HasMemoryGetPeakUsage ) ? memory_get_peak_usage ( true ) : 0 ; - - // Check if the file exists, but only if the file is on a local filesystem - if ( ! preg_match ( '#^ [^:]+ ://#ix', $filename ) && ! file_exists ( $filename ) ) - error ( new PdfToTextDecodingException ( "File \"$filename\" does not exist." ) ) ; - - // Load its contents - $contents = @file_get_contents ( $filename, FILE_BINARY ) ; - - if ( $contents === false ) - error ( new PdfToTextDecodingException ( "Unable to open \"$filename\"." ) ) ; - - return ( $this -> __load ( $filename, $contents, $user_password, $owner_password ) ) ; - } - - - public function LoadFromString ( $contents, $user_password = false, $owner_password = false ) - { - $this -> __memory_usage_start = ( self::$HasMemoryGetUsage ) ? memory_get_usage ( true ) : 0 ; - $this -> __memory_peak_usage_start = ( self::$HasMemoryGetPeakUsage ) ? memory_get_peak_usage ( true ) : 0 ; - - return ( $this -> __load ( '', $contents, $user_password, $owner_password ) ) ; - } - - - private function __load ( $filename, $contents, $user_password = false, $owner_password = false ) - { - // Search for the start of the document ("%PDF-x.y") - $start_offset = strpos ( $contents, '%PDF' ) ; - - if ( $start_offset === false ) // Not a pdf document ! - error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ; - else // May be a PDF document - $this -> DocumentStartOffset = $start_offset ; - - // Check that this is a PDF file with a valid version number - if ( ! preg_match ( '/ %PDF- (?P \d+ (\. 
\d+)*) /ix', $contents, $match, 0, $start_offset ) ) - error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ; - - $this -> PdfVersion = $match [ 'version' ] ; - - // Initializations - $this -> Text = '' ; - $this -> FontTable = new PdfTexterFontTable ( ) ; - $this -> Filename = realpath ( $filename ) ; - $this -> Pages = array ( ) ; - $this -> Images = array ( ) ; - $this -> ImageData = array ( ) ; - $this -> ImageCount = 0 ; - $this -> AutoSavedImageFiles = array ( ) ; - $this -> PageMap = new PdfTexterPageMap ( ) ; - $this -> PageLocations = array ( ) ; - $this -> Author = '' ; - $this -> CreatorApplication = '' ; - $this -> ProducerApplication = '' ; - $this -> CreationDate = '' ; - $this -> ModificationDate = '' ; - $this -> Title = '' ; - $this -> Subject = '' ; - $this -> Keywords = '' ; - $this -> GotAuthorInformation = false ; - $this -> ID = '' ; - $this -> ID2 = '' ; - $this -> EncryptionData = false ; - $this -> EnhancedStatistics = ( ( $this -> Options & self::PDFOPT_ENHANCED_STATISTICS ) != 0 ) ; - - // Also reset cached information that may come from previous runs - $this -> MapIdBuffer = array ( ) ; - $this -> RtlCharacterBuffer = array ( ) ; - $this -> CharacterMapBuffer = array ( ) ; - $this -> FontObjectsBuffer = array ( ) ; - $this -> FormData = array ( ) ; - $this -> FormDataObjectNumbers = false ; - $this -> FomDataDefinitions = array ( ) ; - $this -> FormDataObjects = array ( ) ; - $this -> CaptureDefinitions = false ; - $this -> CaptureObject = false ; - $this -> DocumentFragments = array ( ) ; - - // Enable the PDFOPT_BASIC_LAYOUT option if the PDFOPT_CAPTURE flag is specified - if ( $this -> Options & self::PDFOPT_CAPTURE ) - $this -> Options |= self::PDFOPT_BASIC_LAYOUT ; - - // Enable the PDFOPT_BASIC_LAYOUT_OPTION is PDFOPT_DEBUG_SHOW_COORDINATES is specified - if ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES ) - $this -> Options |= self::PDFOPT_BASIC_LAYOUT ; - - // Page layout options 
needs more instructions to be retained - select the appropriate list of useless instructions - if ( $this -> Options & self::PDFOPT_BASIC_LAYOUT ) - $this -> IgnoredInstructions = self::$IgnoredInstructionsLayout ; - else - $this -> IgnoredInstructions = self::$IgnoredInstructionsNoLayout ; - - - // Debug statistics - $this -> Statistics = array - ( - 'TextSize' => 0, // Total size of drawing instructions ("text" objects) - 'OptimizedTextSize' => 0, // Optimized text size, with useless instructions removed - 'Distributions' => array // Statistics about handled instructions distribution - Works only with the page layout option in debug mode - ( - 'operand' => 0, - 'Tm' => 0, - 'Td' => 0, - 'TD' => 0, - "'" => 0, - 'TJ' => 0, - 'Tj' => 0, - 'Tf' => 0, - 'TL' => 0, - 'T*' => 0, - '(' => 0, - '<' => 0, - '[' => 0, - 'cm' => 0, - 'BT' => 0, - 'template' => 0, - 'ignored' => 0, - 'space' => 0 - ) - ) ; - - // Per-instance execution time limit - $this -> ExecutionStartTime = microtime ( true ) ; - - if ( $this -> MaxExecutionTime > 0 ) - $this -> AllowedExecutionTime = $this -> MaxExecutionTime ; - else - $this -> AllowedExecutionTime = self::$PhpMaxExecutionTime + $this -> MaxExecutionTime ; - - // Adjust in case of inconsistent values - if ( $this -> AllowedExecutionTime < 0 || $this -> AllowedExecutionTime > self::$PhpMaxExecutionTime ) - $this -> AllowedExecutionTime = self::$PhpMaxExecutionTime - 1 ; - - // Systematically set the DECODE_IMAGE_DATA flag if the AUTOSAVE_IMAGES flag has been specified - if ( $this -> Options & self::PDFOPT_AUTOSAVE_IMAGES ) - $this -> Options |= self::PDFOPT_DECODE_IMAGE_DATA ; - - // Systematically set the GET_IMAGE_DATA flag if DECODE_IMAGE_DATA is specified (debug mode only) - if ( self::$DEBUG && $this -> Options & self::PDFOPT_DECODE_IMAGE_DATA ) - $this -> Options |= self::PDFOPT_GET_IMAGE_DATA ; - - // Since page layout options take 2 bits, but not all of the 4 possible values are allowed, make sure that an invalid - // value 
will default to PDFOPT_RAW_LAYOUT value - $layout_option = $this -> Options & self::PDFOPT_LAYOUT_MASK ; - - if ( ! $layout_option === self::PDFOPT_RAW_LAYOUT && $layout_option !== self::PDFOPT_BASIC_LAYOUT ) - { - $layout_option = self::PDFOPT_RAW_LAYOUT ; - $this -> Options = ( $this -> Options & ~self::PDFOPT_LAYOUT_MASK ) | self::PDFOPT_RAW_LAYOUT ; - } - - // Author information needs to be processed after, because it may reference objects that occur later in the PDF stream - $author_information_object_id = false ; - - // Extract pdf objects that are enclosed by the "obj" and "endobj" keywords - $pdf_objects = array ( ) ; - $contents_offset = $this -> DocumentStartOffset ; - $contents_length = strlen ( $contents ) ; - - - while ( $contents_offset < $contents_length && - preg_match ( '/(?P (?P \d+) \s+ \d+ \s+ obj (?P .*?) endobj )/imsx', $contents, $match, PREG_OFFSET_CAPTURE, $contents_offset ) ) - { - $object_number = $match [ 'object_id' ] [0] ; - $object_data = $match [ 'object' ] [0] ; - - // Handle the special case of object streams (compound objects) - // They are not added in the $pdf_objects array, because they could be mistakenly processed as relevant information, - // such as font definitions, etc. - // Instead, only the objects they are embedding are stored in this array. 
- if ( $this -> IsObjectStream ( $object_data ) ) - { - // Ignore ill-formed object streams - if ( ( $object_stream_matches = $this -> DecodeObjectStream ( $object_number, $object_data ) ) !== false ) - { - // Add this list of objects to the list of known objects - for ( $j = 0, $object_stream_count = count ( $object_stream_matches [ 'object_id' ] ) ; $j < $object_stream_count ; $j ++ ) - $pdf_objects [ $object_stream_matches [ 'object_id' ] [$j] ] = $object_stream_matches [ 'object' ] [$j] ; - } - } - // Normal (non-compound) object - else - $pdf_objects [ $object_number ] = $object_data ; - - // Update current offset through PDF contents - $contents_offset = $match [ 're' ] [1] + strlen ( $match [ 're' ] [0] ) ; - } - - // We put a particular attention in treating errors returned by preg_match_all() here, since we need to be really sure why stopped - // to find further PDF objects in the supplied contents - $preg_error = preg_last_error ( ) ; - - switch ( $preg_error ) - { - case PREG_NO_ERROR : - break ; - - case PREG_INTERNAL_ERROR : - error ( new PdfToTextDecodingException ( "PDF object extraction : the preg_match_all() function encountered an internal error." ) ) ; - - case PREG_BACKTRACK_LIMIT_ERROR : - error ( new PdfToTextDecodingException ( "PDF object extraction : backtrack limit reached (you may have to modify the pcre.backtrack_limit " . - "setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.backtrack_limit' ) . ")." ) ) ; - - case PREG_JIT_STACKLIMIT_ERROR : - error ( new PdfToTextDecodingException ( "PDF object extraction : JIT stack limit reached (you may disable this feature by setting the pcre.jit " . - "setting of your PHP.ini file to 0)." ) ) ; - - case PREG_RECURSION_LIMIT_ERROR : - error ( new PdfToTextDecodingException ( "PDF object extraction : recursion limit reached (you may have to modify the pcre.recursion_limit " . - "setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.recursion_limit' ) . 
")." ) ) ; - - case PREG_BAD_UTF8_ERROR : - error ( new PdfToTextDecodingException ( "PDF object extraction : bad UTF8 character encountered." ) ) ; - - case PREG_BAD_UTF8_OFFSET_ERROR : - error ( new PdfToTextDecodingException ( "PDF object extraction : the specified offset does not start at the beginning of a valid UTF8 codepoint." ) ) ; - - default : - error ( new PdfToTextDecodingException ( "PDF object extraction : unkown PREG error #$preg_error" ) ) ; - } - - - // Extract trailer information, which may contain the ID of an object specifying encryption flags - $this -> GetTrailerInformation ( $contents, $pdf_objects ) ; - unset ( $contents ) ; - - // Character maps encountered so far - $cmaps = array ( ) ; - - // An array that will store object ids as keys and text contents as values - $text = array ( ) ; - - // Loop through the objects - foreach ( $pdf_objects as $object_number => $object_data ) - { - // Some additional objects may be uncovered after processing (in an object containing compacted objects for example) - // so add them to the list if necessary - if ( ! isset ( $pdf_objects [ $object_number ] ) ) - $pdf_objects [ $object_number ] = $object_data ; - - // Try to catch information related to page mapping - but don't discard the object since it can contain additional information - $this -> PageMap -> Peek ( $object_number, $object_data, $pdf_objects ) ; - - // Check if the object contais authoring information - it can appear encoded or unencoded - if ( ! $this -> GotAuthorInformation ) - $author_information_object_id = $this -> PeekAuthorInformation ( $object_number, $object_data ) ; - - // Also catch the object encoding type - $type = $this -> GetEncodingType ( $object_number, $object_data ) ; - $stream_match = null ; - - if ( strpos ( $object_data, 'stream' ) === false || - ! preg_match ( '#[^/] stream \s+ (?P .*?) 
endstream#imsx', $object_data, $stream_match ) ) - { - // Some font definitions are in clear text in an object, some are encoded in a stream within the object - // We process here the unencoded ones - if ( $this -> IsFont ( $object_data ) ) - { - $this -> FontTable -> Add ( $object_number, $object_data, $pdf_objects, $this -> AdobeExtraMappings ) ; - continue ; - } - // Some character maps may also be in clear text - else if ( $this -> IsCharacterMap ( $object_data ) ) - { - $cmap = PdfTexterCharacterMap::CreateInstance ( $object_number, $object_data, $this -> AdobeExtraMappings ) ; - - if ( $cmap ) - $cmaps [] = $cmap ; - - continue ; - } - // Check if there is an association between font number and object number - else if ( $this -> IsFontMap ( $object_data ) ) - { - $this -> FontTable -> AddFontMap ( $object_number, $object_data ) ; - } - // Retrieve form data if present - else if ( $this -> IsFormData ( $object_data ) ) - { - $this -> RetrieveFormData ( $object_number, $object_data, $pdf_objects ) ; - } - // Ignore other objects that do not contain an encoded stream - else - { - if ( self::$DEBUG > 1 ) - echo "\n----------------------------------- UNSTREAMED #$object_number\n$object_data" ; - - continue ; - } - } - // Extract image data, if any - else if ( $this -> IsImage ( $object_data ) ) - { - $this -> AddImage ( $object_number, $stream_match [ 'stream' ], $type, $object_data ) ; - continue ; - } - // Check if there is an association between font number and object number - else if ( $this -> IsFontMap ( $object_data ) ) - { - $this -> FontTable -> AddFontMap ( $object_number, $object_data ) ; - - if ( ! $stream_match ) - continue ; - } - - // Check if the stream contains data (yes, I have found a sample that had streams of length 0...) 
- // In other words : ignore empty streams - if ( stripos ( $object_data, '/Length 0' ) !== false ) - continue ; - - // Isolate stream data and try to find its encoding type - if ( isset ( $stream_match [ 'stream' ] ) ) - $stream_data = ltrim ( $stream_match [ 'stream' ], "\r\n" ) ; - else - continue ; - - // Ignore this stream if the object does not contain an encoding type (/FLATEDECODE, /ASCIIHEX or /ASCII85) - if ( $type == self::PDF_UNKNOWN_ENCODING ) - { - if ( self::$DEBUG > 1 ) - echo "\n----------------------------------- UNENCODED #$object_number :\n$object_data" ; - - continue ; - } - - // Decode the encoded stream - $decoded_stream_data = $this -> DecodeData ( $object_number, $stream_data, $type, $object_data ) ; - - // Second chance to peek author information, this time on a decoded stream data - if ( ! $this -> GotAuthorInformation ) - $author_information_object_id = $this -> PeekAuthorInformation ( $object_number, $decoded_stream_data ) ; - - // Check for character maps - if ( $this -> IsCharacterMap ( $decoded_stream_data ) ) - { - $cmap = PdfTexterCharacterMap::CreateInstance ( $object_number, $decoded_stream_data, $this -> AdobeExtraMappings ) ; - - if ( $cmap ) - $cmaps [] = $cmap ; - } - // Font definitions - else if ( $this -> IsFont ( $decoded_stream_data ) ) - { - $this -> FontTable -> Add ( $object_number, $decoded_stream_data, $pdf_objects, $this -> AdobeExtraMappings ) ; - } - // Retrieve form data if present - else if ( $this -> IsFormData ( $object_data ) ) - { - $this -> RetrieveFormData ( $object_number, $decoded_stream_data, $pdf_objects ) ; - } - // Plain text (well, in fact PDF drawing instructions) - else if ( $this -> IsText ( $object_data, $decoded_stream_data ) ) - { - $text_data = false ; - - // Check if we need to ignore page headers and footers - if ( $this -> Options & self::PDFOPT_IGNORE_HEADERS_AND_FOOTERS ) - { - if ( ! 
$this -> IsPageHeaderOrFooter ( $decoded_stream_data ) ) - { - $text [ $object_number ] = - $text_data = $decoded_stream_data ; - } - // However, they may be mixed with actual text contents so we need to separate them... - else - { - $this -> ExtractTextData ( $object_number, $decoded_stream_data, $remainder, $header, $footer ) ; - - // We still need to check again that the extracted text portion contains something useful - if ( $this -> IsText ( $object_data, $remainder ) ) - { - $text [ $object_number ] = - $text_data = $remainder ; - } - } - } - else - { - $text [ $object_number ] = - $text_data = $decoded_stream_data ; - } - - - // The current object may be a text object that have been defined as an XObject in some other object - // In this case, we have to keep it since it may be referenced by a /TPLx construct from within - // another text object - if ( $text_data ) - $this -> PageMap -> AddTemplateObject ( $object_number, $text_data ) ; - } - // This may be here the opportunity to look into the $FormData property and replace object ids with their corresponding data - else - { - $found = false ; - - foreach ( $this -> FormData as &$form_entry ) - { - if ( is_integer ( $form_entry [ 'values' ] ) && $object_number == $form_entry [ 'values' ] ) - { - $form_entry [ 'values' ] = $decoded_stream_data ; - $found = true ; - } - else if ( is_integer ( $form_entry [ 'form' ] ) && $object_number == $form_entry [ 'form' ] ) - { - $form_entry [ 'form' ] = $decoded_stream_data ; - $found = true ; - } - } - - if ( ! 
$found && self::$DEBUG > 1 ) - echo "\n----------------------------------- UNRECOGNIZED #$object_number :\n$decoded_stream_data\n" ; - } - } - - // Form data object numbers - $this -> FormDataObjectNumbers = array_keys ( $this -> FormData ) ; - - // Associate character maps with declared fonts - foreach ( $cmaps as $cmap ) - $this -> FontTable -> AddCharacterMap ( $cmap ) ; - - // Current font defaults to -1, which means : take the first available font as the current one. - // Sometimes it may happen that text drawing instructions do not set a font at all (PdfPro for example) - $current_font = -1 ; - - // Build the page catalog - $this -> Pages = array ( ) ; - $this -> PageMap -> MapObjects ( $text ) ; - - // Add font mappings local to each page - $mapped_fonts = $this -> PageMap -> GetMappedFonts ( ) ; - $this -> FontTable -> AddPageFontMap ( $mapped_fonts ) ; - - // Extract text from the collected text elements - foreach ( $this -> PageMap -> Pages as $page_number => $page_objects ) - { - // Checks if this page is selected - if ( ! 
$this -> IsPageSelected ( $page_number ) ) - continue ; - - $this -> Pages [ $page_number ] = '' ; - - if ( $layout_option === self::PDFOPT_RAW_LAYOUT ) - { - foreach ( $page_objects as $page_object ) - { - if ( isset ( $text [ $page_object ] ) ) - { - $new_text = $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ; - $object_text = $this -> ExtractText ( $page_number, $page_object, $new_text, $current_font ) ; - $this -> Pages [ $page_number ] .= $object_text ; - } - else if ( self::$DEBUG > 1 ) - echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ; - } - } - // New style (basic) layout rendering - else if ( $layout_option === self::PDFOPT_BASIC_LAYOUT ) - { - $page_fragments = array ( ) ; - - foreach ( $page_objects as $page_object ) - { - if ( isset ( $text [ $page_object ] ) ) - { - $new_text = $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ; - $this -> ExtractTextWithLayout ( $page_fragments, $page_number, $page_object, $new_text, $current_font ) ; - } - else if ( self::$DEBUG > 1 ) - echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ; - } - - $this -> Pages [ $page_number ] = $this -> __assemble_text_fragments ( $page_number, $page_fragments, $page_width, $page_height ) ; - - $this -> DocumentFragments [ $page_number ] = array - ( - 'fragments' => $page_fragments, - 'page-width' => $page_width, - 'page_height' => $page_height - ) ; - } - } - - // Retrieve author information - if ( $this -> GotAuthorInformation ) - $this -> RetrieveAuthorInformation ( $author_information_object_id, $pdf_objects ) ; - - // Build the page locations (ie, starting and ending offsets) - $offset = 0 ; - $page_separator = utf8_encode ( $this -> PageSeparator ) ; - $page_separator_length = strlen ( $page_separator ) ; - - foreach ( $this -> Pages as $page_number => &$page ) - { - // If hyphenated words are 
unwanted, then remove them - if ( $this -> Options & self::PDFOPT_NO_HYPHENATED_WORDS ) - $page = preg_replace ( self::$RemoveHyphensRegex, '$4$2', $page ) ; - - $length = strlen ( $page ) ; - $this -> PageLocations [ $page_number ] = array ( 'start' => $offset, 'end' => $offset + $length - 1 ) ; - $offset += $length + $page_separator_length ; - } - - // And finally, the Text property - $this -> Text = implode ( $page_separator, $this -> Pages ) ; - - // Free memory - $this -> MapIdBuffer = array ( ) ; - $this -> RtlCharacterBuffer = array ( ) ; - $this -> CharacterMapBuffer = array ( ) ; - - // Compute memory occupied for this file - $memory_usage_end = ( self::$HasMemoryGetUsage ) ? memory_get_usage ( true ) : 0 ; - $memory_peak_usage_end = ( self::$HasMemoryGetPeakUsage ) ? memory_get_peak_usage ( true ) : 0 ; - - $this -> MemoryUsage = $memory_usage_end - $this -> __memory_usage_start ; - $this -> MemoryPeakUsage = $memory_peak_usage_end - $this -> __memory_peak_usage_start ; - - // Adjust the "Distributions" statistics - if ( $this -> Options & self::PDFOPT_ENHANCED_STATISTICS ) - { - $instruction_count = 0 ; - $statistics = array ( ) ; - - // Count the total number of instructions - foreach ( $this -> Statistics [ 'Distributions' ] as $count ) - $instruction_count += $count ; - - // Now transform the Distributions entries into an associative array containing the instruction counts - // ('count') and their relative percentage - foreach ( $this -> Statistics [ 'Distributions' ] as $name => $count ) - { - if ( $instruction_count ) - $percent = round ( ( 100.0 / $instruction_count ) * $count, 2 ) ; - else - $percent = 0 ; - - $statistics [ $name ] = array - ( - 'instruction' => $name, - 'count' => $count, - 'percent' => $percent - ) ; - } - - // Set the new 'Distributions' array and sort it by instruction count in reverse order - $this -> Statistics [ 'Distributions' ] = $statistics ; - uksort ( $this -> Statistics [ 'Distributions' ], array ( $this, 
'__sort_distributions' ) ) ; - } - - // All done, return - return ( $this -> Text ) ; - } - - - public function __sort_distributions ( $a, $b ) - { return ( $this -> Statistics [ 'Distributions' ] [$b] [ 'count' ] - $this -> Statistics [ 'Distributions' ] [$a] [ 'count' ] ) ; } - - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - AddAdobeExtraMappings - Adds extra mappings for standard Adobe fonts. - - PROTOTYPE - $pdf -> AddAdobeExtraMappings ( $mappings ) ; - - DESCRIPTION - Adobe supports 4 predefined fonts : standard, Mac, WinAnsi and PDF). All the characters in these fonts - are identified by a character time, a little bit like HTML entities ; for example, 'one' will be the - character '1', 'acircumflex' will be '', etc. - There are thousands of character names defined by Adobe (see https://mupdf.com/docs/browse/source/pdf/pdf-glyphlist.h.html). - Some of them are not in this list ; this is the case for example of the 'ax' character names, where 'x' - is a decimal number. When such a character is specified in a /Differences array, then there is somewhere - a CharProc[] array giving an object id for each of those characters. - The referenced object(s) in turn contain drawing instructions to draw the glyph. At no point you could - guess what is the corresponding Unicode character for this glyph, since the information is not contained - in the PDF file. - The AddAdobeExtraMappings() method allows you to specify such correspondences. Specify an array as the - $mappings parameter, whose keys are the Adobe character name (for example, "a127") and values the - corresponding Unicode values (see the description of the $mappings parameter for more information). - - PARAMETERS - $mappings (associative array) - - Associative array whose keys are Adobe character names. 
The array values can take several forms : - - A character - - An integer value - - An array of up to four character or integer values. - Internally, every specified value is converted to an array of four integer values, one for - each of the standard Adobe character sets (Standard, Mac, WinAnsi and PDF). The following - rules apply : - - If the input value is a single character, the output array corrsponding the Adobe character - name will be a set of 4 elements corresponding to the ordinal value of the supplied - character. - - If the input value is an integer, the output array will be a set of 4 identical values - - If the input value is an array : - . Arrays with less that 4 elements will be padded, using the last array item for padding - . Arrays with more than 4 elements will be silently truncated - . Each array value can either be a character or a numeric value. - - NOTES - In this current implementation, the method applies the mappings to ALL Adobe default fonts. That is, - you cannot have one mapping for one Adobe font referenced in the PDF file, then a second mapping for - a second Adobe font, etc. 
- - *-------------------------------------------------------------------------------------------------------------*/ - public function AddAdobeExtraMappings ( $mappings ) - { - // Loop through each mapping - foreach ( $mappings as $key => $value ) - { - // Character value : we retain its ordinal value as the 4 values of the output array - if ( is_string ( $value ) ) - { - $ord = ord ( $value ) ; - $items = array ( $ord, $ord, $ord, $ord ) ; - } - // Numeric value : the output array will contain 4 times the supplied value - else if ( is_numeric ( $value ) ) - { - $value = ( integer ) $value ; - $items = array ( $value, $value, $value, $value ) ; - } - // Array value : make sure we will have an output array of 4 values - else if ( is_array ( $value ) ) - { - $items = array ( ) ; - - // Collect the supplied values, converting characters to their ordinal values if necessary - for ( $i = 0, $count = count ( $value ) ; $i < $count && $i < 4 ; $i ++ ) - { - $code = $value [$i] ; - - if ( is_string ( $code ) ) - $items [] = ord ( $code ) ; - else - $items [] = ( integer ) $code ; - } - - // Ensure that we have 4 values ; fill the missing ones with the last seen value if necessary - $count = count ( $items ) ; - - if ( ! $count ) - error ( new PdfToTextException ( "Adobe extra mapping \"$key\" has no values." ) ) ; - - $last_value = $items [ $count - 1 ] ; - - for ( $i = $count ; $i < 4 ; $i ++ ) - $items [] = $last_value ; - } - else - error ( new PdfToTextException ( "Invalid value \"$value\" for Adobe extra mapping \"$key\"." ) ) ; - - // Add this current mapping to the Adobe extra mappings array - $this -> AdobeExtraMappings [ $key ] = $items ; - } - } - - - /*-------------------------------------------------------------------------------------------------------------- - - NAME - GetPageFromOffset - Returns a page number from a text offset. 
PROTOTYPE
            $page = $pdf -> GetPageFromOffset ( $offset ) ;

        DESCRIPTION
            Given a byte offset in the Text property, returns its page number in the pdf document.

        PARAMETERS
            $offset (integer) -
                Offset, in the Text property, whose page number is to be retrieved.
                The value false is accepted and simply gives false back.

        RETURN VALUE
            Returns a page number in the pdf document, or false if the specified offset does not exist.

     *-------------------------------------------------------------------------------------------------------------*/
    public function GetPageFromOffset ( $offset )
    {
        if ( $offset !== false )
        {
            // Each page location holds the inclusive 'start' and 'end' offsets of the page in the Text property
            foreach ( $this -> PageLocations as $page_number => $bounds )
            {
                $inside = ( $offset >= $bounds [ 'start' ] && $offset <= $bounds [ 'end' ] ) ;

                if ( $inside )
                    return ( $page_number ) ;
            }
        }

        return ( false ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            text_strpos, text_stripos - Search for an occurrence of a string.

        PROTOTYPE
            $result = $pdf -> text_strpos  ( $search, $start = 0 ) ;
            $result = $pdf -> text_stripos ( $search, $start = 0 ) ;

        DESCRIPTION
            These methods behave as the strpos/stripos PHP functions, except that :
            - They operate on the text contents of the pdf file (Text property)
            - They return an array containing the page number and text offset. $result [0] will be set to the page
              number of the searched text, and $result [1] to its offset in the Text property

        PARAMETERS
            $search (string) -
                String to be searched.

            $start (integer) -
                Start offset in the pdf text contents.

        RETURN VALUE
            Returns an array of two values containing the page number and text offset if the searched string has
            been found, or false otherwise.
*-------------------------------------------------------------------------------------------------------------*/
    public function text_strpos ( $search, $start = 0 )
    {
        // Case-sensitive search ; the offset is expressed in UTF-8 characters, as returned by mb_strpos()
        $position = mb_strpos ( $this -> Text, $search, $start, 'UTF-8' ) ;

        if ( $position === false )
            return ( false ) ;

        $page = $this -> GetPageFromOffset ( $position ) ;

        return ( array ( $page, $position ) ) ;
    }


    public function text_stripos ( $search, $start = 0 )
    {
        // Same as text_strpos(), but case-insensitive
        $position = mb_stripos ( $this -> Text, $search, $start, 'UTF-8' ) ;

        if ( $position === false )
            return ( false ) ;

        $page = $this -> GetPageFromOffset ( $position ) ;

        return ( array ( $page, $position ) ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            document_strpos, document_stripos - Search for all occurrences of a string.

        PROTOTYPE
            $result = $pdf -> document_strpos  ( $search, $group_by_page = false ) ;
            $result = $pdf -> document_stripos ( $search, $group_by_page = false ) ;

        DESCRIPTION
            Searches for ALL occurrences of a given string in the pdf document. The value of the $group_by_page
            parameter determines how the results are returned :
            - When true, the returned value will be an associative array whose keys will be page numbers and values
              arrays of offsets of the found string within the page
            - When false, the returned value will be an array of arrays containing two entries : the page number
              and the text offset.

            For example, if a pdf document contains the string "here" at character offsets 100 and 200 in page 1, and
            position 157 in page 3, the returned value will be :
            - When $group_by_page is false :
                [ [ 1, 100 ], [ 1, 200 ], [ 3, 157 ] ]
            - When $group_by_page is true :
                [ 1 => [ 100, 200 ], 3 => [ 157 ] ]

        PARAMETERS
            $search (string) -
                String to be searched.

            $group_by_page (boolean) -
                Indicates whether the found offsets should be grouped by page number or not.
RETURN VALUE
            Returns an array of page numbers/character offsets (see Description above). The array is empty when
            the string does not appear in the document. Returns false if the searched string is empty.

     *-------------------------------------------------------------------------------------------------------------*/
    public function document_strpos ( $text, $group_by_page = false )
    {
        return ( $this -> __document_search ( 'mb_strpos', $text, $group_by_page ) ) ;
    }


    public function document_stripos ( $text, $group_by_page = false )
    {
        return ( $this -> __document_search ( 'mb_stripos', $text, $group_by_page ) ) ;
    }


    // __document_search -
    //      Common implementation of document_strpos() and document_stripos(). $finder is the name of the multibyte
    //      search function to use ('mb_strpos' or 'mb_stripos').
    private function __document_search ( $finder, $text, $group_by_page )
    {
        // The search functions work with character offsets, so the step between two searches must be expressed
        // in characters too ; using the byte length would skip matches when $text contains multibyte characters.
        $length = mb_strlen ( $text, 'UTF-8' ) ;

        if ( ! $length )
            return ( false ) ;

        $result = array ( ) ;
        $index = 0 ;

        while ( ( $index = $finder ( $this -> Text, $text, $index, 'UTF-8' ) ) !== false )
        {
            $page = $this -> GetPageFromOffset ( $index ) ;

            if ( $group_by_page )
                $result [ $page ] [] = $index ;
            else
                $result [] = array ( $page, $index ) ;

            // Resume the search right after the current occurrence (occurrences never overlap)
            $index += $length ;
        }

        return ( $result ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            text_match, document_match - Search string using regular expressions.

        PROTOTYPE
            $status = $pdf -> text_match     ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ;
            $status = $pdf -> document_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ;

        DESCRIPTION
            text_match() calls the preg_match() PHP function on the pdf text contents, to locate the first occurrence
            of text that matches the specified regular expression.
            document_match() calls the preg_match_all() function to locate all occurrences that match the specified
            regular expression.
Note that both methods add the PREG_OFFSET_CAPTURE flag when calling preg_match/preg_match_all so you
            should be aware that all captured results are an array containing the following entries :
            - Item [0] is the captured string
            - Item [1] is its text offset
            - The text_match() and document_match() methods add an extra array item (index 2), which contains the
              page number where the matched text resides

        PARAMETERS
            $pattern (string) -
                Regular expression to be searched.

            $match (any) -
                Output captures. See preg_match/preg_match_all. Only assigned when at least one match is found.

            $flags (integer) -
                PCRE flags. See preg_match/preg_match_all.

            $offset (integer) -
                Start offset. See preg_match/preg_match_all.

        RETURN VALUE
            Returns the number of matched occurrences, or false if the specified regular expression is invalid.

     *-------------------------------------------------------------------------------------------------------------*/
    public function text_match ( $pattern, &$match = null, $flags = 0, $offset = 0 )
    {
        $captures = null ;
        $status = preg_match ( $pattern, $this -> Text, $captures, $flags | PREG_OFFSET_CAPTURE, $offset ) ;

        // No match or invalid pattern : the caller's $match variable is left untouched
        if ( ! $status )
            return ( $status ) ;

        // Append the page number (index 2) to every capture
        foreach ( array_keys ( $captures ) as $key )
            $captures [ $key ] [2] = $this -> GetPageFromOffset ( $captures [ $key ] [1] ) ;

        $match = $captures ;

        return ( $status ) ;
    }


    public function document_match ( $pattern, &$matches = null, $flags = 0, $offset = 0 )
    {
        $captures = null ;
        $status = preg_match_all ( $pattern, $this -> Text, $captures, $flags | PREG_OFFSET_CAPTURE, $offset ) ;

        // No match or invalid pattern : the caller's $matches variable is left untouched
        if ( ! $status )
            return ( $status ) ;

        // Captures are nested one level deeper than for preg_match() ; append the page number to each of them
        foreach ( array_keys ( $captures ) as $outer )
        {
            foreach ( array_keys ( $captures [ $outer ] ) as $inner )
                $captures [ $outer ] [ $inner ] [2] = $this -> GetPageFromOffset ( $captures [ $outer ] [ $inner ] [1] ) ;
        }

        $matches = $captures ;

        return ( $status ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        HasFormData -
            Returns true if the PDF file contains form data or not.
*-------------------------------------------------------------------------------------------------------------*/
    public function HasFormData ( )
    {
        return ( count ( $this -> FormData ) > 0 ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        GetFormCount -
            Returns the number of top-level forms contained in the PDF file.

     *-------------------------------------------------------------------------------------------------------------*/
    public function GetFormCount ( )
    {
        return ( count ( $this -> FormData ) ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            GetFormData - Returns form data, if any

        PROTOTYPE
            $object = $pdf -> GetFormData ( $template = null, $form_index = 0 ) ;

        DESCRIPTION
            Retrieves form data if present. The object built for a given form index is cached, so subsequent calls
            for the same index return the same object (the $template parameter is then ignored).

        PARAMETERS
            $template (string) -
                An XML file describing form data using human-readable names for field values.
                If not specified, the inline form definitions will be used, together with the field names
                specified in the PDF file.

            $form_index (integer) -
                Form index in the PDF file. So far, I really don't know if a PDF file can have multiple forms.

        RETURN VALUE
            An object derived from the PdfToTextFormData class.

     *-------------------------------------------------------------------------------------------------------------*/
    public function GetFormData ( $template = null, $form_index = 0 )
    {
        // Already built for this form index
        if ( isset ( $this -> FormDataObjects [ $form_index ] ) )
            return ( $this -> FormDataObjects [ $form_index ] ) ;

        // Valid indexes are those present in FormDataObjectNumbers ; this rejects negative indexes as well as
        // an index equal to the number of forms, which the previous "greater than count" test let through
        if ( ! isset ( $this -> FormDataObjectNumbers [ $form_index ] ) )
            error ( new PdfToTextFormException ( "Invalid form index #$form_index." ) ) ;

        $form_data = $this -> FormData [ $this -> FormDataObjectNumbers [ $form_index ] ] ;
        $xml_data = null ;

        if ( $template )
        {
            if ( ! file_exists ( $template ) )
                error ( new PdfToTextFormException ( "Form data template file \"$template\" not found." ) ) ;

            $xml_data = file_get_contents ( $template ) ;
        }

        // Without template data (null), the definitions are built from the inline form description only
        $definitions = new PdfToTextFormDefinitions ( $xml_data, $form_data [ 'form' ] ) ;
        $object = $definitions [ $form_index ] -> GetFormDataFromPdfObject ( $form_data [ 'values' ] ) ;

        // Store the results under the form index : the cache lookup above is keyed by form index, so appending
        // with [] would return the wrong object when forms are not requested in ascending order
        $this -> FormDataDefinitions [ $form_index ] = $definitions ;
        $this -> FormDataObjects [ $form_index ] = $object ;

        return ( $object ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            MarkTextLike - Marks output text.

        PROTOTYPE
            $pdf -> MarkTextLike ( $regex, $marker_start, $marker_end ) ;

        DESCRIPTION
            Sometimes it may be convenient, when you want to extract only a portion of text, to say : "I want to
            extract text between this title and this title". The MarkTextLike() method provides some support for
            such a task. Imagine you have documents that have the same structure, all starting with an "Introduction"
            title :

                Introduction
                    ...
                    some text
                    ...
                Some other title
                    ...

            By calling the MarkTextLike() method such as in the example below :

                $pdf -> MarkTextLike ( '/\bIntroduction\b/', '<M>', '</M>' ) ;

            then you will get back the following output :

                <M>Introduction</M>
                    ...
                    some text
                    ...
                <M>Some other title</M>

            Adding such markers in the output will allow you to easily extract the text between the chapters
            "Introduction" and "Some other title", using a regular expression.

            The font name used for the first string matched by the specified regular expression will be searched
            later to add markers around all the text portions using this font.


        PARAMETERS
            $regex (string) -
                A regular expression to match the text to be matched. Subsequent portions of text using the
                same font will be surrounded by the marker start/end strings.
$marker_start, $marker_end (string) -
                Markers to surround the string when a match is found.

     *-------------------------------------------------------------------------------------------------------------*/
    public function MarkTextLike ( $regex, $marker_start, $marker_end )
    {
        // The marker is only recorded here, in the list of not yet processed font-based markers
        $marker = array ( 'regex' => $regex, 'start' => $marker_start, 'end' => $marker_end ) ;

        $this -> UnprocessedMarkerList [ 'font' ] [] = $marker ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            SetCaptures, SetCapturesFromString - Defines document parts to be captured.

        PROTOTYPE
            $pdf -> SetCaptures ( $xml_file ) ;
            $pdf -> SetCapturesFromString ( $xml_data ) ;

        DESCRIPTION
            Defines document parts to be captured.
            SetCaptures() takes the definitions for the areas to be captured from an XML file, while
            SetCapturesFromString() takes them from a string representing xml capture definitions.

        NOTES
            - See file README.md for an explanation on the format of the XML capture definition file.
            - The SetCaptures() methods must be called before the Load() method.

     *-------------------------------------------------------------------------------------------------------------*/
    public function SetCaptures ( $xml_file )
    {
        $found = file_exists ( $xml_file ) ;

        if ( ! $found )
            error ( new PdfToTextException ( "File \"$xml_file\" does not exist." ) ) ;

        // The file contents are handed over to the string-based variant
        $this -> SetCapturesFromString ( file_get_contents ( $xml_file ) ) ;
    }


    public function SetCapturesFromString ( $xml_data )
    {
        // Setting capture areas implies having the PDFOPT_BASIC_LAYOUT option
        $this -> Options = $this -> Options | self::PDFOPT_BASIC_LAYOUT ;

        $definitions = new PdfToTextCaptureDefinitions ( $xml_data ) ;
        $this -> CaptureDefinitions = $definitions ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            GetCaptures - Returns captured data.
PROTOTYPE
            $object = $pdf -> GetCaptures ( $full = false ) ;

        PARAMETERS
            $full (boolean) -
                When true, the whole captures, together with their definitions, are returned. When false,
                only a basic object containing the capture names and their values is returned.

        DESCRIPTION
            Returns the object that contains captured data. The captured object is built on the first call, then
            reused by subsequent calls.

        RETURN VALUE
            An object of type PdfToTextCaptures, or false if an error occurred (which includes the case where no
            capture definitions have been supplied).

     *-------------------------------------------------------------------------------------------------------------*/
    public function GetCaptures ( $full = false )
    {
        if ( ! $this -> CaptureObject )
        {
            // No capture definitions were supplied : there is nothing to build the captures from.
            // Return false, as documented, instead of failing on a method call on a non-object.
            if ( empty ( $this -> CaptureDefinitions ) )
                return ( false ) ;

            $this -> CaptureDefinitions -> SetPageCount ( count ( $this -> Pages ) ) ;
            $this -> CaptureObject = $this -> CaptureDefinitions -> GetCapturedObject ( $this -> DocumentFragments ) ;
        }

        if ( $full )
            return ( $this -> CaptureObject ) ;
        else
            return ( $this -> CaptureObject -> ToCaptures ( ) ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            AddImage - Adds an image from the PDF stream to the current object.

        PROTOTYPE
            $this -> AddImage ( $object_id, $stream_data, $type, $object_data ) ;

        DESCRIPTION
            Adds an image from the PDF stream to the current object.
            If the PDFOPT_GET_IMAGE_DATA flag is enabled, image data will be added to the ImageData property.
            If the PDFOPT_DECODE_IMAGE_DATA flag is enabled, a jpeg resource will be created and added into the
            Images array property.

        PARAMETERS
            $object_id (integer) -
                Pdf object id.

            $stream_data (string) -
                Contents of the unprocessed stream data containing the image.

            $type (integer) -
                One of the PdfToText::PDF_*_ENCODING constants.
*-------------------------------------------------------------------------------------------------------------*/
    protected function AddImage ( $object_id, $stream_data, $type, $object_data )
    {
        $options = $this -> Options ;

        // Raw image data is only retained in debug mode, and only for JPEG (DCT-encoded) streams
        if ( self::$DEBUG && ( $options & self::PDFOPT_GET_IMAGE_DATA ) )
        {
            if ( $type == self::PDF_DCT_ENCODING )
                $this -> ImageData = array ( 'type' => 'jpeg', 'data' => $stream_data ) ;
        }

        // Nothing more to do unless image decoding has been requested
        if ( ! ( $options & self::PDFOPT_DECODE_IMAGE_DATA ) )
            return ;

        // A MaxExtractedImages value of zero means "no limit"
        if ( $this -> MaxExtractedImages && $this -> ImageCount >= $this -> MaxExtractedImages )
            return ;

        $autosave = $options & self::PDFOPT_AUTOSAVE_IMAGES ;
        $image = $this -> DecodeImage ( $object_id, $stream_data, $type, $object_data, $autosave ) ;

        // Unsupported image encoding
        if ( $image === false )
            return ;

        $this -> ImageCount ++ ;

        // Without the PDFOPT_AUTOSAVE_IMAGES flag, the decoded image is simply kept in memory
        if ( ! $autosave )
        {
            $this -> Images [] = $image ;
            return ;
        }

        // Otherwise a real output filename is generated, the image is saved to that file and the image object
        // is released ; only the filename is remembered
        $output_filename = $this -> __get_output_image_filename ( ) ;

        $image -> SaveAs ( $output_filename, $this -> ImageAutoSaveFormat ) ;
        unset ( $image ) ;

        $this -> AutoSavedImageFiles [] = $output_filename ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            DecodeData - Decodes stream data.

        PROTOTYPE
            $data = $this -> DecodeData ( $object_id, $stream_data, $type, $object_data ) ;

        DESCRIPTION
            Decodes stream data (binary data located between the "stream" and "endstream" directives) according to the
            specified encoding type, given in the surrounding object parameters.

        PARAMETERS
            $object_id (integer) -
                Id of the object containing the data.

            $stream_data (string) -
                Contents of the binary stream.
$type (integer) -
                One of the PDF_*_ENCODING constants, as returned by the GetEncodingType() method.

            $object_data (string) -
                Not used by this method.

        RETURN VALUE
            Returns the decoded stream data. The result is an empty string for an encoding type that is not handled
            here, and can be false when flate or Ascii85 data could not be decoded.

     *-------------------------------------------------------------------------------------------------------------*/
    protected function DecodeData ( $object_id, $stream_data, $type, $object_data )
    {
        switch ( $type )
        {
            case self::PDF_FLATE_ENCODING :
                // Objects in password-protected Pdf files SHOULD be encrypted ; however, unencrypted ones may be
                // encountered too. This is why inflating is always tried first ; decryption is the fallback.
                $inflated = @gzuncompress ( $stream_data ) ;

                if ( $inflated !== false )
                    return ( $inflated ) ;

                if ( ! $this -> IsEncrypted )
                {
                    if ( self::$DEBUG > 1 )
                        warning ( new PdfToTextDecodingException ( "Invalid gzip data.", $object_id ) ) ;

                    return ( false ) ;
                }

                $decrypted = $this -> EncryptionData -> Decrypt ( $object_id, $stream_data ) ;

                if ( $decrypted === false && self::$DEBUG > 1 )
                    warning ( new PdfToTextDecodingException ( "Unable to decrypt object contents.", $object_id ) ) ;

                return ( $decrypted ) ;

            case self::PDF_LZW_ENCODING :
                return ( $this -> __decode_lzw ( $stream_data ) ) ;

            case self::PDF_ASCIIHEX_ENCODING :
                return ( $this -> __decode_ascii_hex ( $stream_data ) ) ;

            case self::PDF_ASCII85_ENCODING :
                $decoded = $this -> __decode_ascii_85 ( $stream_data ) ;

                // Dumbly check if this could not be gzipped data after decoding (normally, the object flags should
                // also specify the /FlateDecode flag)
                if ( $decoded !== false )
                {
                    $inflated = @gzuncompress ( $decoded ) ;

                    if ( $inflated !== false )
                        $decoded = $inflated ;
                }

                return ( $decoded ) ;

            case self::PDF_TEXT_ENCODING :
                return ( $stream_data ) ;
        }

        // Any other encoding type
        return ( '' ) ;
    }


    //
// __decode_lzw -
    //      Decoding function for LZW encoded data. This function is largely inspired by the TCPDF one but has been
    //      rewritten for a performance gain of 30-35%.
    //      Returns the decoded string ; decoding stops at the end-of-data code (257) or when the remaining bits are
    //      too few to hold a complete code.
    private function __decode_lzw ( $data )
    {
        // The initial dictionary contains 256 entries where each index is equal to its character representation.
        // Codes 256 (clear table) and 257 (end of data) are markers and have no dictionary entry.
        static $initial_dictionary = null ;

        // Dictionary lengths - when the dictionary size reaches one of the values specified as the key, the code
        // width has to be set to the corresponding value
        static $code_widths = array
           (
             511 => 10,
            1023 => 11,
            2047 => 12
            ) ;

        if ( $initial_dictionary === null )
            $initial_dictionary = array_map ( 'chr', range ( 0, 255 ) ) ;

        // Decoded string to be returned
        $result = '' ;

        // Convert the input to a string of '0' and '1' characters, so that codes of any width can be read
        $bit_string = '' ;
        $byte_count = strlen ( $data ) ;

        for ( $i = 0 ; $i < $byte_count ; $i ++ )
            $bit_string .= sprintf ( '%08b', ord ( $data [$i] ) ) ;

        $bit_count = $byte_count * 8 ;

        // Initialize dictionary
        $bit_length = 9 ;
        $dictionary_index = 258 ;
        $dictionary = $initial_dictionary ;

        // Previously decoded code, and read position in the bit string
        $previous_index = 0 ;
        $start_index = 0 ;

        // Read codes only while a COMPLETE one remains : padding bits at the end of the stream must not be
        // interpreted as a (truncated) code
        while ( $start_index + $bit_length <= $bit_count )
        {
            $index = bindec ( substr ( $bit_string, $start_index, $bit_length ) ) ;

            // End-of-data marker
            if ( $index === 257 )
                break ;

            $start_index += $bit_length ;

            // Clear table marker : reset dictionary and code width
            if ( $index === 256 )
            {
                $bit_length = 9 ;
                $dictionary_index = 258 ;
                $previous_index = 256 ;
                $dictionary = $initial_dictionary ;
            }
            // First code after a clear table marker : output it as is
            else if ( $previous_index === 256 )
            {
                $result .= $dictionary [ $index ] ;
                $previous_index = $index ;
            }
            else
            {
                // Code already in the dictionary : output its string ; the new entry is the previous string followed
                // by the first character of the current one
                if ( $index < $dictionary_index )
                {
                    $result .= $dictionary [ $index ] ;
                    $dictionary_value = $dictionary [ $previous_index ] . $dictionary [ $index ] [0] ;
                    $previous_index = $index ;
                }
                // Code not yet in the dictionary : it can only designate the entry being created now, which is the
                // previous string followed by its own first character
                else
                {
                    $dictionary_value = $dictionary [ $previous_index ] . $dictionary [ $previous_index ] [0] ;
                    $result .= $dictionary_value ;

                    // The entry created below becomes the previous code. Not doing so makes every dictionary entry
                    // built after this point wrong.
                    $previous_index = $dictionary_index ;
                }

                // Update dictionary
                $dictionary [ $dictionary_index ++ ] = $dictionary_value ;

                // Change code width whenever a dictionary size limit is reached
                if ( isset ( $code_widths [ $dictionary_index ] ) )
                    $bit_length = $code_widths [ $dictionary_index ] ;
            }
        }

        // All done, return
        return ( $result ) ;
    }


    // __decode_ascii_hex -
    //      Decoder for /AsciiHexDecode streams.
    //      Returns the decoded bytes, or an empty string if the data contains an invalid character or is not
    //      terminated by the '>' end-of-data marker.
    private function __decode_ascii_hex ( $input )
    {
        $output = '' ;
        $high_nibble = null ;       // First digit of a pair, waiting for its second digit
        $is_comment = false ;
        $length = strlen ( $input ) ;

        for ( $i = 0 ; $i < $length && $input [$i] !== '>' ; $i ++ )
        {
            $c = $input [$i] ;

            // A comment runs until the end of the line. Control characters are written with double quotes : with
            // single quotes, '\n' is a backslash followed by the letter n and never matches a single character.
            if ( $is_comment )
            {
                if ( $c === "\r" || $c === "\n" )
                    $is_comment = false ;

                continue ;
            }

            // White space between digits is ignored
            if ( $c === "\0" || $c === "\t" || $c === "\r" || $c === "\f" || $c === "\n" || $c === ' ' )
                continue ;

            if ( $c === '%' )
            {
                $is_comment = true ;
                continue ;
            }

            // Anything else must be a hexadecimal digit
            if ( ! ctype_xdigit ( $c ) )
                return ( '' ) ;

            $code = hexdec ( $c ) ;

            if ( $high_nibble === null )
                $high_nibble = $code ;
            else
            {
                $output .= chr ( ( $high_nibble << 4 ) | $code ) ;
                $high_nibble = null ;
            }
        }

        // The loop stopped on the end of the string, not on the '>' marker
        if ( $i >= $length )
            return ( '' ) ;

        // Odd number of digits : the last digit is handled as if it were followed by a zero
        if ( $high_nibble !== null )
            $output .= chr ( $high_nibble << 4 ) ;

        return ( $output ) ;
    }


    // __decode_ascii_85 -
    //      Decoder for /Ascii85Decode streams.
// Returns the decoded bytes, or false if the input is empty or contains a character that is not allowed.
    private function __decode_ascii_85 ( $data )
    {
        // Ordinal value of the first character used in Ascii85 encoding
        static $first_ord = 33 ;
        // A 'z' in the input data means "sequence of 4 nuls"
        static $z_exception = "\0\0\0\0" ;
        // Powers of 85, from 4 to 0
        static $exp85 = array ( 52200625, 614125, 7225, 85, 1 ) ;

        // Ignore empty data
        if ( $data === '' )
            return ( false ) ;

        $data_length = strlen ( $data ) ;
        $ords = array ( ) ;
        $ord_count = 0 ;
        $result = '' ;

        // Paranoia : Ascii85 data may start with '<~' (but it always ends with '~>'). Anyway, we must start past
        // this construct if present. strncmp() is safe on one-character input, unlike reading $data [1].
        $start = ( strncmp ( $data, '<~', 2 ) === 0 ) ? 2 : 0 ;

        // Loop through input characters, until the '~' of the end-of-data marker
        for ( $i = $start ; $i < $data_length && $data [$i] != '~' ; $i ++ )
        {
            $ch = $data [$i] ;

            // Most common case : current character is in the range of the Ascii85 encoding ('!'..'u')
            if ( $ch >= '!' && $ch <= 'u' )
                $ords [ $ord_count ++ ] = ord ( $ch ) - $first_ord ;
            // 'z' is replaced with a sequence of null bytes ; it is only allowed between two groups
            else if ( $ch == 'z' && ! $ord_count )
                $result .= $z_exception ;
            // Spaces are ignored
            else if ( $ch === "\0" || $ch === "\t" || $ch === ' ' || $ch === "\r" || $ch === "\n" || $ch === "\f" )
                continue ;
            // Other characters : corrupted data...
            else
                return ( false ) ;

            // We have collected 5 characters in base 85 : convert their 32-bit value to 4 bytes
            if ( $ord_count == 5 )
            {
                $ord_count = 0 ;

                for ( $sum = 0, $j = 0 ; $j < 5 ; $j ++ )
                    $sum = ( $sum * 85 ) + $ords [ $j ] ;

                for ( $j = 3 ; $j >= 0 ; $j -- )
                    $result .= chr ( $sum >> ( $j * 8 ) ) ;
            }
        }

        // Last, incomplete group : n characters encode n-1 bytes. Adding one to the last digit compensates for the
        // digits that were dropped by the encoder, so that the most significant bytes are correct.
        if ( $ord_count )
        {
            for ( $i = 0, $sum = 0 ; $i < $ord_count ; $i ++ )
                $sum += ( $ords [ $i ] + ( $i == $ord_count - 1 ? 1 : 0 ) ) * $exp85 [$i] ;

            for ( $i = 0 ; $i < $ord_count - 1 ; $i ++ )
                $result .= chr ( $sum >> ( ( 3 - $i ) * 8 ) ) ;
        }

        // All done, return
        return ( $result ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            DecodeImage - Returns decoded image contents.

        PROTOTYPE
            $image = $this -> DecodeImage ( $object_id, $stream_data, $type, $object_data, $autosave ) ;

        DESCRIPTION
            Creates an image object from stream data, according to the encoding type of the stream.

        PARAMETERS
            $object_id (integer) -
                Pdf object number.

            $stream_data (string) -
                Object data.

            $type (integer) -
                One of the PdfToText::PDF_*_ENCODING constants.

            $autosave (boolean) -
                When autosave is selected, images will not be decoded into memory unless they have a format
                different from JPEG. This is intended to save memory.

        RETURN VALUE
            Returns an image object, or false if the image encoding type is not currently supported.
*-------------------------------------------------------------------------------------------------------------*/
    protected function DecodeImage ( $object_id, $stream_data, $type, $object_data, $autosave )
    {
        // Normal JPEG image
        if ( $type == self::PDF_DCT_ENCODING )
            return ( new PdfJpegImage ( $stream_data, $autosave ) ) ;

        // CCITT fax image
        if ( $type == self::PDF_CCITT_FAX_ENCODING )
            return ( new PdfFaxImage ( $stream_data ) ) ;

        // For now, I have not found enough information to be able to decode image data in an inflated stream...
        // In some cases, however, this is JPEG data
        if ( $type == self::PDF_FLATE_ENCODING )
        {
            $image = PdfInlinedImage::CreateInstance ( $stream_data, $object_data, $autosave ) ;

            return ( $image ? $image : false ) ;
        }

        // Unsupported encoding type
        return ( false ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            DecodeObjectStream - Decodes an object stream.

        PROTOTYPE
            $array = $this -> DecodeObjectStream ( $object_id, $object_data ) ;

        DESCRIPTION
            Decodes an object stream. An object stream is yet another PDF object type that contains itself several
            objects not defined using the "x y obj ... endobj" syntax.
            As far as I understood, object streams data is contained within stream/endstream delimiters, and is
            gzipped.
            Object streams start with a set of object id/offset pairs separated by a space ; catenated object data
            immediately follows the last space ; for example :

                1167 0 1168 29 <</Type/Catalog/Pages 3 0 R>>[/ICCBased 1156 0 R]

            The above example specifies two objects :
                . Object #1167, which starts at offset 0 and ends before the second object, at offset #28 in
                  the data. The contents are :
                    <</Type/Catalog/Pages 3 0 R>>
                . Object #1168, which starts at offset #29 and continues until the end of the object stream.
It contains the following data :
                    [/ICCBased 1156 0 R]

        PARAMETERS
            $object_id (integer) -
                Pdf object number.

            $object_data (string) -
                Object data.

        RETURN VALUE
            Returns false if any error occurred (mainly for syntax reasons).
            Otherwise, returns an associative array containing the following elements :
            - object_id :
                Array of all the object ids contained in the object stream.
            - object :
                Array of corresponding object data.

            The reason for this format is that it is identical to the array returned by the preg_match() function
            used in the Load() method for finding objects in a PDF file (ie, a regex that matches "x y obj/endobj"
            constructs).

     *-------------------------------------------------------------------------------------------------------------*/
    protected function DecodeObjectStream ( $object_id, $object_data )
    {
        // Extract the (normally gzipped) data for this object. The capture MUST be named "stream", since this is
        // the name used below to retrieve it.
        if ( ! preg_match ( '#[^/] stream ( (\r? \n) | \r ) (?P<stream> .*?) endstream#imsx', $object_data, $stream_match ) )
        {
            // Stay prepared to find one day a sample declared as an object stream but not having gzipped data
            // delimited by stream/endstream tags
            if ( self::$DEBUG > 1 )
                error ( new PdfToTextDecodingException ( "Found object stream without gzipped data", $object_id ) ) ;

            return ( false ) ;
        }

        $stream_data = $stream_match [ 'stream' ] ;
        $type = $this -> GetEncodingType ( $object_id, $object_data ) ;
        $decoded_data = $this -> DecodeData ( $object_id, $stream_data, $type, $object_data ) ;

        if ( self::$DEBUG > 1 )
            echo "\n----------------------------------- OBJSTREAM #$object_id\n$decoded_data" ;

        // Object streams data start with a series of object id/offset pairs. The offset is absolute to the first
        // character after the last space of these series.
        // Note : on Windows platforms, the default stack size is 1Mb. The following regular expression will make
        // Apache crash in most cases, so you have to enable the following lines in your http.ini file to set a stack
        // size of 8Mb, as for Unix systems :
        //      Include conf/extra/httpd-mpm.conf
        //      ThreadStackSize 8388608
        if ( ! preg_match ( '/^ \s* (?P<series> (\d+ \s* )+ )/x', $decoded_data, $series_match ) )
        {
            if ( self::$DEBUG > 1 )
                error ( new PdfToTextDecodingException ( "Object stream does not start with integer object id/offset pairs.", $object_id ) ) ;

            return ( false ) ;
        }

        // Extract the series of object id/offset pairs and the stream object data
        $series = explode ( ' ', rtrim ( preg_replace ( '/\s+/', ' ', $series_match [ 'series' ] ) ) ) ;
        $data = substr ( $decoded_data, strlen ( $series_match [ 'series' ] ) ) ;

        // $series should contain an even number of values
        if ( count ( $series ) % 2 )
        {
            if ( self::$DEBUG )
                warning ( new PdfToTextDecodingException ( "Object stream should start with an even number of integer values.", $object_id ) ) ;

            array_pop ( $series ) ;
        }

        // Extract every individual object
        $objects = array ( 'object_id' => array ( ), 'object' => array ( ) ) ;

        for ( $i = 0, $count = count ( $series ) ; $i < $count ; $i += 2 )
        {
            $id = ( int ) $series [$i] ;
            $offset = ( int ) $series [$i+1] ;

            // If there is a "next" object, extract only a substring within the object stream contents ; the offset
            // of the next object is the second value of the next pair, hence the index $i + 3
            if ( isset ( $series [ $i + 3 ] ) )
                $object_contents = substr ( $data, $offset, ( int ) $series [ $i + 3 ] - $offset ) ;
            // Otherwise, extract everything until the end
            else
                $object_contents = substr ( $data, $offset ) ;

            $objects [ 'object_id' ] [] = $id ;
            $objects [ 'object' ] [] = $object_contents ;
        }

        return ( $objects ) ;
    }


    /*--------------------------------------------------------------------------------------------------------------

        NAME
            ExtractTextData - Extracts text, header & footer information from a text object.

PROTOTYPE
    $this -> ExtractTextData ( $object_id, $stream_contents, &$text, &$header, &$footer ) ;

DESCRIPTION
    Extracts text, header & footer information from a text object. The extracted text contents will be
    stripped from any header/footer information.

PARAMETERS
    $object_id (integer) -
        Id of the object being processed (currently unused by this method).

    $stream_contents (string) -
        Decoded contents of the text object.

    $text (string) -
        Variable that will receive text contents.

    $header, $footer (string) -
        Variables that will receive header and footer information. When several header (or footer)
        blocks are found, only the last one is kept.

 *-------------------------------------------------------------------------------------------------------------*/
protected function ExtractTextData ( $object_id, $stream_contents, &$text, &$header, &$footer )
{
    // Normally, a header or footer is introduced with a construct like :
    //    << /Type /Pagination ... [/Bottom] ... >> (or [/Top])
    // The initial regular expression was :
    //    << .*? \[ \s* / (?P<location> (Bottom) | (Top) ) \s* \] .*? >> \s* BDC .*? EMC
    // (the data contained between the BDC and EMC instructions are text-drawing instructions).
    // However, this expression revealed to be too greedy and captured too much data ; in the following example :
    //    << ... >> ...(several kb of drawing instructions)... << ... [/Bottom] ... >> BDC (other drawing instructions for the page footer) EMC
    // everything was captured, from the initial "<<" to the final "EMC", which caused regular page contents to be
    // interpreted as page bottom contents.
    // The ".*?" in the regex has been replaced with "[^>]*?", which works better. However, it will fail to recognize
    // header/footer contents if the header/footer declaration contains a nested construct, such as :
    //    << /Type /Pagination ... [/Bottom] ... << (some nested contents) >> ... >> (or [/Top])
    // Let's wait for the case to happen one day...
    // The group names "contents" and "location" are the keys read from $matches below.
    static $header_or_footer_re = '#
                    (?P<contents>
                        << [^>]*? \[ \s* / (?P<location> (Bottom) | (Top) ) \s* \] [^>]*? >> \s*
                        BDC .*? EMC
                     )
                    #imsx' ;

    $header = '' ;
    $footer = '' ;
    $text   = $stream_contents ;

    // No header/footer construct (or a regex failure) : the text is the whole stream
    if ( ! preg_match_all ( $header_or_footer_re, $stream_contents, $matches ) )
        return ;

    foreach ( $matches [ 'contents' ] as $index => $contents )
    {
        if ( ! strcasecmp ( $matches [ 'location' ] [ $index ], 'Bottom' ) )
            $footer = $contents ;
        else
            $header = $contents ;
    }

    // preg_replace() returns null on failure (for example when the backtrack limit is reached) ; in that
    // case keep the unmodified stream rather than losing the page text
    $stripped = preg_replace ( $header_or_footer_re, '', $stream_contents ) ;

    if ( $stripped !== null )
        $text = $stripped ;
}


/*--------------------------------------------------------------------------------------------------------------

NAME
    ExtractText - extracts text from a pdf stream.

PROTOTYPE
    $text = $this -> ExtractText ( $page_number, $object_id, $data, &$current_font ) ;

DESCRIPTION
    Extracts text from decoded stream contents.

PARAMETERS
    $page_number (integer) -
        Page number that contains the text to be extracted.

    $object_id (integer) -
        Object id of this text block.

    $data (string) -
        Stream contents.

    $current_font (integer) -
        Id of the current font, which should be found in the $this->FontTable property, if anything
        went ok.
        This parameter is required, since text blocks may not specify a new font resource id and reuse
        the one that waas set before.

RETURN VALUE
    Returns the decoded text.

NOTES
    The PDF language can be seen as a stack-driven language ; for example, the instruction defining a text
    matrix ( "Tm" ) expects 6 floating-point values from the stack :

        0 0 0 0 x y Tm

    It can also specify specific operators, such as /Rx, which sets font number "x" to be the current font,
    or even "<< >>" constructs that we can ignore during our process of extracting textual data.
Actually, we only want to handle a very small subset of the Adobe drawing language ; These are :
    - "Tm" instructions, that specify, among others, the x and y coordinates of the next text to be output
    - "/R" instructions, that specify which font is to be used for the next text output. This is useful
      only if the font has an associated character map.
    - "/F", same as "/R", but use a font map id instead of a direct object id.
    - Text, specified either using a single notation ( "(sometext)" ) or the array notation
      ( "[(...)d1(...)d2...(...)]" ), which allows for specifying inter-character spacing.
    - "Tf" instructions, that specifies the font size. This is to be able to compute approximately the
      number of empty lines between two successive Y coordinates in "Tm" instructions
    - "TL" instructions, that define the text leading to be used by "T*"

    This is why the process of text extraction has been decomposed into three steps :
    - The first one, the lowest-level step, is a tokenizer that extracts individual elements, such as "Tm",
      "TJ", "/Rx" or "510.77". This is handled by the __next_token() method.
    - The second one, __next_instruction(), collects tokens. It pushes every floating-point value onto the
      stack, until an instruction is met.
    - The third one, ExtractText(), processes data returned by __next_instruction(), and actually performs
      the (restricted) parsing of text drawing instructions.

    Side effects visible in this method : it fills $this -> CharacterMapBuffer, may add entries to
    $this -> TextWithFontMarkers (removing the matching entry from $this -> UnprocessedMarkerList [ 'font' ]),
    and updates $current_font, which is passed by reference.

 *-------------------------------------------------------------------------------------------------------------*/
protected function ExtractText ( $page_number, $object_id, $data, &$current_font )
{
    $new_data = $this -> __strip_useless_instructions ( $data ) ;

    if ( self::$DEBUG )
    {
        echo "\n----------------------------------- TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
        echo $data ;
        echo "\n----------------------------------- OPTIMIZED TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
        echo $new_data ;
    }

    $data = $new_data ;

    // Index into the specified block of text-drawing instructions
    $data_index = 0 ;

    $data_length = strlen ( $data ) ;       // Data length
    $result      = '' ;                     // Resulting string

    // Y-coordinate of the last seen "Tm" instruction
    $last_goto_y = 0 ;
    $last_goto_x = 0 ;

    // Y-coordinate of the last seen "Td" or "TD" relative positioning instruction
    $last_relative_goto_y = 0 ;

    // When true, the current text should be output on the same line as the preceding one
    $use_same_line = false ;

    // Instruction preceding the current one
    $last_instruction = true ;

    // Current font size
    $current_font_size = 0 ;

    // Active template
    $current_template = '' ;

    // Various pre-computed variables
    $separator_length = strlen ( $this -> Separator ) ;

    // Current font map width, in bytes, plus a flag saying whether the current font is mapped or not
    $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;

    // Extra newlines to add before the current text
    $extra_newlines = 0 ;

    // Text leading used by T*
    $text_leading = 0 ;

    // Set to true if a separator needs to be inserted
    $needs_separator = false ;

    // A flag to tell if we should "forget" the last instruction
    $discard_last_instruction = false ;

    // A flag that tells whether the Separator and BlockSeparator properties are identical
    $same_separators = ( $this -> Separator == $this -> BlockSeparator ) ;

    // Instruction count (used for handling execution timeouts)
    $instruction_count = 0 ;

    // Loop through instructions
    while ( ( $instruction = $this -> __next_instruction ( $page_number, $data, $data_length, $data_index, $current_template ) ) !== false )
    {
        $fragment = '' ;

        $instruction_count ++ ;

        // Timeout handling - don't test for every instruction processed
        if ( ! ( $instruction_count % 100 ) )
        {
            // Global timeout handling
            if ( $this -> Options & self::PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME )
            {
                $now = microtime ( true ) ;

                if ( $now - self::$GlobalExecutionStartTime > self::$MaxGlobalExecutionTime )
                    error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", true, self::$PhpMaxExecutionTime, self::$MaxGlobalExecutionTime ) ) ;
            }

            // Per-instance timeout handling
            if ( $this -> Options & self::PDFOPT_ENFORCE_EXECUTION_TIME )
            {
                $now = microtime ( true ) ;

                if ( $now - $this -> ExecutionStartTime > $this -> MaxExecutionTime )
                    error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", false, self::$PhpMaxExecutionTime, $this -> MaxExecutionTime ) ) ;
            }
        }

        // Character position after the current instruction
        $data_index = $instruction [ 'next' ] ;

        // Process current instruction
        switch ( $instruction [ 'instruction' ] )
        {
            // Raw text (enclosed by parentheses) or array text (enclosed within square brackets)
            // is returned as a single instruction
            case 'text' :
                // Empty arrays of text may be encountered - ignore them
                if ( ! count ( $instruction [ 'values' ] ) )
                    break ;

                // Check if we have to insert a newline
                if ( ! $use_same_line )
                {
                    $fragment        .= $this -> EOL ;
                    $needs_separator  = false ;
                }
                // Roughly simulate spacing between lines by inserting newline characters
                else if ( $extra_newlines > 0 )
                {
                    $fragment        .= str_repeat ( $this -> EOL, $extra_newlines ) ;
                    $extra_newlines   = 0 ;
                    $needs_separator  = false ;
                }
                else
                    $needs_separator = true ;

                // Add a separator if necessary
                if ( $needs_separator )
                {
                    // If the Separator and BlockSeparator properties are the same (and not empty), only add a block separator if
                    // the current result does not end with it
                    if ( $same_separators )
                    {
                        if ( $this -> Separator != '' && substr ( $fragment, - $separator_length ) != $this -> BlockSeparator )
                            $fragment .= $this -> BlockSeparator ;
                    }
                    else
                        $fragment .= $this -> BlockSeparator ;
                }

                $needs_separator = true ;
                $value_index     = 0 ;

                // Fonts having character maps will require some special processing
                if ( $current_font_mapped )
                {
                    // Loop through each text value
                    foreach ( $instruction [ 'values' ] as $text )
                    {
                        $is_hex  = ( $text [0] == '<' ) ;
                        $length  = strlen ( $text ) - 1 ;
                        $handled = false ;

                        // Characters are encoded within angle brackets ( "<>" ).
                        // Note that several characters can be specified within the same angle brackets, so we have to take
                        // into account the width we detected in the begincodespacerange construct
                        if ( $is_hex )
                        {
                            for ( $i = 1 ; $i < $length ; $i += $current_font_map_width )
                            {
                                $value = substr ( $text, $i, $current_font_map_width ) ;
                                $ch    = hexdec ( $value ) ;

                                if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
                                    $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
                                else if ( $current_font == -1 )
                                {
                                    $newchar = chr ( $ch ) ;
                                }
                                else
                                {
                                    $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch ) ;
                                    $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ;
                                }

                                $fragment .= $newchar ;
                            }

                            $handled = true ;
                        }
                        // Yes ! double-byte codes can also be specified as plain text within parentheses !
                        // However, we have to be really careful here ; the sequence :
                        //    (Be)
                        // can mean the string "Be" or the Unicode character 0x4265 ('B' = 0x42, 'e' = 0x65)
                        // We first look if the character map contains an entry for Unicode codepoint 0x4265 ;
                        // if not, then we have to consider that it is regular text to be taken one character by
                        // one character. In this case, we fall back to the "if ( ! $handled )" condition
                        else if ( $current_font_map_width == 4 )
                        {
                            $temp_result = '' ;

                            for ( $i = 1 ; $i < $length ; $i ++ )
                            {
                                // Each character in the pair may be a backslash, which escapes the next character so we must skip it
                                // This code needs to be reviewed ; the same code is duplicated to handle escaped characters in octal notation
                                if ( $text [$i] != '\\' )
                                    $ch1 = $text [$i] ;
                                else
                                {
                                    $i ++ ;

                                    if ( $text [$i] < '0' || $text [$i] > '7' )
                                        $ch1 = $this -> ProcessEscapedCharacter ( $text [$i] ) ;
                                    else
                                    {
                                        $oct         = '' ;
                                        $digit_count = 0 ;

                                        while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 )
                                        {
                                            $oct .= $text [$i ++] ;
                                            $digit_count ++ ;
                                        }

                                        $ch1 = chr ( octdec ( $oct ) ) ;
                                        $i -- ;
                                    }
                                }

                                $i ++ ;

                                if ( $text [$i] != '\\' )
                                    $ch2 = $text [$i] ;
                                else
                                {
                                    $i ++ ;

                                    if ( $text [$i] < '0' || $text [$i] > '7' )
                                        $ch2 = $this -> ProcessEscapedCharacter ( $text [$i] ) ;
                                    else
                                    {
                                        $oct         = '' ;
                                        $digit_count = 0 ;

                                        while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 )
                                        {
                                            $oct .= $text [$i ++] ;
                                            $digit_count ++ ;
                                        }

                                        $ch2 = chr ( octdec ( $oct ) ) ;
                                        $i -- ;
                                    }
                                }

                                // Build the 2-bytes character code
                                $ch = ( ord ( $ch1 ) << 8 ) | ord ( $ch2 ) ;

                                if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
                                    $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
                                else
                                {
                                    $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch, true ) ;
                                    $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ;
                                }

                                // Yes !!! for characters encoded with two bytes, we can find the following construct :
                                //    0x00 "\" "(" 0x00 "C" 0x00 "a" 0x00 "r" 0x00 "\" ")"
                                // which must be expanded as : (Car)
                                // We have here the escape sequences "\(" and "\)", but the backslash is encoded on two bytes
                                // (although the MSB is nul), while the escaped character is encoded on 1 byte. waiting
                                // for the next quirk to happen...
                                if ( $newchar == '\\' && isset ( $text [ $i + 2 ] ) )
                                {
                                    $newchar = $this -> ProcessEscapedCharacter ( $text [ $i + 2 ] ) ;
                                    $i ++ ;     // this time we processed 3 bytes, not 2
                                }

                                $temp_result .= $newchar ;
                            }

                            // Happens only if we were unable to translate a character using the current character map
                            $fragment .= $temp_result ;
                            $handled   = true ;
                        }

                        // Character strings within parentheses.
                        // For every text value, use the character map table for substitutions
                        if ( ! $handled )
                        {
                            for ( $i = 1 ; $i < $length ; $i ++ )
                            {
                                $ch = $text [$i] ;

                                // Set to true to optimize calls to MapCharacters
                                // Currently does not work with pobox@dizy.sk/infoma.pdf (a few characters differ)
                                $use_map_buffer = false ;

                                // ... but don't forget to handle escape sequences "\n" and "\r" for characters
                                // 10 and 13
                                if ( $ch == '\\' )
                                {
                                    $ch = $text [++$i] ;

                                    // Escaped character
                                    if ( $ch < '0' || $ch > '7' )
                                        $ch = $this -> ProcessEscapedCharacter ( $ch ) ;
                                    // However, an octal form can also be specified ; in this case we have to take into account
                                    // the character width for the current font (if the character width is 4 hex digits, then we
                                    // will encounter constructs such as "\000\077").
                                    // The method used here is dirty : we build a regex to match octal character representations on a substring
                                    // of the text
                                    else
                                    {
                                        $width   = $current_font_map_width / 2 ;   // Convert to byte count
                                        $subtext = substr ( $text, $i - 1 ) ;
                                        $regex   = "#^ (\\\\ [0-7]{3}){1,$width} #imsx" ;

                                        $status = preg_match ( $regex, $subtext, $octal_matches ) ;

                                        if ( $status )
                                        {
                                            $octal_values = explode ( '\\', substr ( $octal_matches [0], 1 ) ) ;
                                            $ord          = 0 ;

                                            foreach ( $octal_values as $octal_value )
                                                $ord = ( $ord << 8 ) + octdec ( $octal_value ) ;

                                            $ch  = chr ( $ord ) ;
                                            $i  += strlen ( $octal_matches [0] ) - 2 ;
                                        }
                                    }

                                    $use_map_buffer = false ;
                                }

                                // Add substituted character to the output result
                                $ord = ord ( $ch ) ;

                                if ( ! $use_map_buffer )
                                    $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
                                else
                                {
                                    if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
                                        $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
                                    else
                                    {
                                        $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
                                        $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ;
                                    }
                                }

                                $fragment .= $newchar ;
                            }
                        }

                        // Handle offsets between blocks of characters
                        // NOTE(review): this branch tests the negated offset while the unmapped-font branch below
                        // tests abs() ; kept as is - confirm which one is intended.
                        if ( isset ( $instruction [ 'offsets' ] [ $value_index ] ) &&
                                - ( $instruction [ 'offsets' ] [ $value_index ] ) > $this -> MinSpaceWidth )
                            $fragment .= $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ;

                        $value_index ++ ;
                    }
                }
                // For fonts having no associated character map, we simply encode the string in UTF8
                // after the C-like escape sequences have been processed
                // Note that hexadecimal constructs ( "<...>" ) can be encountered here, so we have to process them as well
                else
                {
                    foreach ( $instruction [ 'values' ] as $text )
                    {
                        $is_hex = ( $text [0] == '<' ) ;
                        $length = strlen ( $text ) - 1 ;

                        // Some text within parentheses may have a backslash followed by a newline, to indicate some continuation line.
                        // Example :
                        //    (this is a sentence \
                        //     continued on the next line)
                        // Funny isn't it ? so remove such constructs because we don't care
                        $text = str_replace ( array ( "\\\r\n", "\\\r", "\\\n" ), '', $text ) ;

                        // Characters are encoded within angle brackets ( "<>" )
                        if ( $is_hex )
                        {
                            for ( $i = 1 ; $i < $length ; $i += 2 )
                            {
                                $ch = hexdec ( substr ( $text, $i, 2 ) ) ;

                                $fragment .= $this -> CodePointToUtf8 ( $ch ) ;
                            }
                        }
                        // Characters are plain text
                        else
                        {
                            $text = self::Unescape ( $text ) ;

                            for ( $i = 1, $length = strlen ( $text ) - 1 ; $i < $length ; $i ++ )
                            {
                                $ch  = $text [$i] ;
                                $ord = ord ( $ch ) ;

                                if ( $ord < 127 )
                                    $newchar = $ch ;
                                else
                                {
                                    if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
                                        $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
                                    else
                                    {
                                        $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
                                        $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ;
                                    }
                                }

                                $fragment .= $newchar ;
                            }
                        }

                        // Handle offsets between blocks of characters
                        if ( isset ( $instruction [ 'offsets' ] [ $value_index ] ) &&
                                abs ( $instruction [ 'offsets' ] [ $value_index ] ) > $this -> MinSpaceWidth )
                            $fragment .= $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ;

                        $value_index ++ ;
                    }
                }

                // Process the markers which do not have an associated font yet - this will be done by matching
                // the current text fragment against one of the regular expressions defined.
                // If a match occurs, then all the subsequent text fragments using the same font will be put markers.
                // The list is walked by key : entries are removed with unset(), which leaves holes in the
                // numeric keys, so a "0 .. count - 1" index loop would hit missing entries and never reach
                // the highest ones.
                $trimmed_fragment = trim ( $fragment ) ;

                foreach ( $this -> UnprocessedMarkerList [ 'font' ] as $marker_index => $marker )
                {
                    if ( preg_match ( $marker [ 'regex' ], $trimmed_fragment ) )
                    {
                        $this -> TextWithFontMarkers [ $current_font ] = array
                           (
                            'font'   => $current_font,
                            'height' => $current_font_size,
                            'regex'  => $marker [ 'regex' ],
                            'start'  => $marker [ 'start' ],
                            'end'    => $marker [ 'end' ]
                            ) ;

                        unset ( $this -> UnprocessedMarkerList [ 'font' ] [ $marker_index ] ) ;

                        break ;
                    }
                }

                // Check if we need to add markers around this text fragment
                if ( isset ( $this -> TextWithFontMarkers [ $current_font ] ) &&
                        $this -> TextWithFontMarkers [ $current_font ] [ 'height' ] == $current_font_size )
                {
                    $fragment = $this -> TextWithFontMarkers [ $current_font ] [ 'start' ] .
                                $fragment .
                                $this -> TextWithFontMarkers [ $current_font ] [ 'end' ] ;
                }

                $result .= $fragment ;

                break ;

            // An "nl" instruction means TJ, Tj, T* or "'"
            case 'nl' :
                if ( ! $instruction [ 'conditional' ] )
                {
                    if ( $instruction [ 'leading' ] && $text_leading && $current_font_size )
                    {
                        $count = ( integer ) ( ( $text_leading - $current_font_size ) / $current_font_size ) ;

                        // Always emit at least one newline ; this also protects str_repeat() against a
                        // negative count, which happens when the leading is well below the font size
                        if ( $count < 1 )
                            $count = 1 ;
                    }
                    else
                        $count = 1 ;

                    $extra                 = str_repeat ( PHP_EOL, $count ) ;
                    $result               .= $extra ;
                    $needs_separator       = false ;
                    $last_goto_y          -= ( $count * $text_leading ) ;   // Approximation on y-coord change
                    $last_relative_goto_y  = 0 ;
                }

                break ;

            // "Tm", "Td" or "TD" : Output text on the same line, if the "y" coordinates are equal
            case 'goto' :
                // Some text is positioned using 'Tm' instructions ; however they can be immediately followed by 'Td' instructions
                // which give a relative positioning ; so consider that the last instruction wins
                if ( $instruction [ 'relative' ] )
                {
                    // Try to put a separator if the x coordinate is non-zero
                    //if ( $instruction [ 'x' ] - $last_goto_x >= $current_font_size )
                    //    $result .= $this -> Separator ;

                    $discard_last_instruction = true ;
                    $extra_newlines           = 0 ;
                    $use_same_line            = ( ( $last_relative_goto_y - abs ( $instruction [ 'y' ] ) ) <= $current_font_size ) ;
                    $last_relative_goto_y     = abs ( $instruction [ 'y' ] ) ;
                    $last_goto_x              = $instruction [ 'x' ] ;

                    if ( - $instruction [ 'y' ] > $current_font_size )
                    {
                        $use_same_line = false ;

                        if ( $last_relative_goto_y )
                            $extra_newlines = ( integer ) ( $current_font_size / $last_relative_goto_y ) ;
                        else
                            $extra_newlines = 0 ;
                    }
                    else if ( ! $instruction [ 'y' ] )
                    {
                        $use_same_line  = true ;
                        $extra_newlines = 0 ;
                    }

                    break ;
                }
                else
                    $last_relative_goto_y = 0 ;

                $y = $last_goto_y + $last_relative_goto_y ;

                if ( $instruction [ 'y' ] == $y || abs ( $instruction [ 'y' ] - $y ) < $current_font_size )
                {
                    $use_same_line  = true ;
                    $extra_newlines = 0 ;
                }
                else
                {
                    // Compute the number of newlines we have to insert between the current and the next lines
                    if ( $current_font_size )
                        $extra_newlines = ( integer ) ( ( $y - $instruction [ 'y' ] - $current_font_size ) / $current_font_size ) ;

                    $use_same_line = ( $last_goto_y == 0 ) ;
                }

                $last_goto_y = $instruction [ 'y' ] ;
                break ;

            // Set font size
            case 'fontsize' :
                $current_font_size = $instruction [ 'size' ] ;
                break ;

            // "/Rx" : sets the current font
            case 'resource' :
                $current_font = $instruction [ 'resource' ] ;

                $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
                break ;

            // "/TPLx" : references a template, which can contain additional font aliases
            case 'template' :
                if ( $this -> PageMap -> IsValidXObjectName ( $instruction [ 'token' ] ) )
                    $current_template = $instruction [ 'token' ] ;

                break ;

            // 'TL' : text leading to be used for the next "T*" in the flow
            case 'leading' :
                if ( ! ( $this -> Options & self::PDFOPT_IGNORE_TEXT_LEADING ) )
                    $text_leading = $instruction [ 'size' ] ;

                break ;


            // 'ET' : we have to reset a few things here
            case 'ET' :
                $current_font           = -1 ;
                $current_font_map_width = 2 ;
                break ;
        }

        // Remember last instruction - this will help us into determining whether we should put the next text
        // on the current or following line
        if ( ! $discard_last_instruction )
            $last_instruction = $instruction ;

        $discard_last_instruction = false ;
    }

    return ( $this -> __rtl_process ( $result ) ) ;
}



// __next_instruction -
//    Retrieves the next instruction from the drawing text block.
private function __next_instruction ( $page_number, $data, $data_length, $index, $current_template )
{
    static $last_instruction = false ;

    $ch = '' ;

    // Constructs such as
    if ( $last_instruction )
    {
        $result = $last_instruction ;
        $last_instruction = false ;

        return ( $result ) ;
    }

    // Whether we should compute enhanced statistics
    $enhanced_statistics = $this -> EnhancedStatistics ;

    // Holds the floating-point values encountered so far
    $number_stack = array ( ) ;

    // Loop through the stream of tokens
    while ( ( $part = $this -> __next_token ( $page_number, $data, $data_length, $index ) ) !== false )
    {
        $token = $part [0] ;
        $next_index = $part [1] ;

        // Floating-point number : push it onto the stack
        if ( ( $token [0] >= '0' && $token [0] <= '9' ) || $token [0] == '-' || $token [0] == '+' || $token [0] == '.'
) - { - $number_stack [] = $token ; - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ; - } - // 'Tm' instruction : return a "goto" instruction with the x and y coordinates - else if ( $token == 'Tm' ) - { - $x = $number_stack [4] ; - $y = $number_stack [5] ; - - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tm' ] ++ ; - - return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => false, 'token' => $token ) ) ; - } - // 'Td' or 'TD' instructions : return a goto instruction with the x and y coordinates (1st and 2nd args) - else if ( $token == 'Td' || $token == 'TD' ) - { - $x = $number_stack [0] ; - $y = $number_stack [1] ; - - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ $token ] ++ ; - - return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => true, 'token' => $token ) ) ; - } - // Output text "'" instruction, with conditional newline - else if ( $token [0] == "'" ) - { - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ "'" ] ++ ; - - return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ; - } - // Same as above - else if ( $token == 'TJ' || $token == 'Tj' ) - { - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ $token ] ++ ; - - return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ; - } - // Set font size - else if ( $token == 'Tf' ) - { - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tf' ] ++ ; - - return ( array ( 'instruction' => 'fontsize', 'next' => $next_index, 'size' => $number_stack [0], 'token' => $token ) ) ; - } - // Text leading (spacing used by T*) - else if ( $token == 'TL' ) - { - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TL' ] ++ ; - - return ( array ( 
'instruction' => 'leading', 'next' => $next_index, 'size' => $number_stack [0], 'token' => $token ) ) ; - } - // Position to next line - else if ( $token == 'T*' ) - { - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'T*' ] ++ ; - - return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => true ) ) ; - } - // Draw object ("Do"). To prevent different text shapes to appear on the same line, we return a "newline" instruction - // here. Note that the shape position is not taken into account here, and shapes will be processed in the order they - // appear in the pdf file (which is likely to be different from their position on a graphic screen). - else if ( $token == 'Do' ) - { - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ; - - return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => false, 'token' => $token ) ) ; - } - // Raw text output - else if ( $token [0] == '(' ) - { - $next_part = $this -> __next_token ( $page_number, $data, $data_length, $next_index, $enhanced_statistics ) ; - $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ; - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '(' ] ++ ; - - if ( $next_part [0] == "'" ) - { - $last_instruction = $instruction ; - return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ; - } - else - return ( $instruction ) ; - } - // Hex digits within angle brackets - else if ( $token [0] == '<' ) - { - $ch = $token [1] ; - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '<' ] ++ ; - $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ; - - if ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM ) - { - $next_part = $this -> __next_token ( 
$page_number, $data, $data_length, $next_index ) ; - $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ; - - if ( $next_part [0] == "'" ) - { - $last_instruction = $instruction ; - return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ; - } - else - return ( $instruction ) ; - } - } - // Text specified as an array of individual raw text elements, and individual interspaces between characters - else if ( $token [0] == '[' ) - { - $values = $this -> __extract_chars_from_array ( $token ) ; - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '[' ] ++ ; - $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => $values [0], 'offsets' => $values [1], 'token' => $token ) ; - - return ( $instruction ) ; - } - // Token starts with a slash : maybe a font specification - else if ( preg_match ( '#^ ( ' . self::$FontSpecifiers . ' ) #ix', $token ) ) - { - $key = "$page_number:$current_template:$token" ; - $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ; - - if ( isset ( $this -> MapIdBuffer [ $key ] ) ) - $id = $this -> MapIdBuffer [ $key ] ; - else - { - $id = $this -> FontTable -> GetFontByMapId ( $page_number, $current_template, $token ) ; - - $this -> MapIdBuffer [ $key ] = $id ; - } - - return ( array ( 'instruction' => 'resource', 'next' => $next_index, 'resource' => $id, 'token' => $token ) ) ; - } - // Template reference, such as /TPL1. Each reference has initially been replaced by !PDFTOTEXT_TEMPLATE_TPLx during substitution - // by ProcessTemplateReferences(), because templates not only specify text to be replaced, but also font aliases - // -and this is the place where we catch font aliases in this case - else if ( preg_match ( '/ !PDFTOTEXT_TEMPLATE_ (?P