forked from michelcrypt4d4mus/pdfalyzer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfont_info.py
161 lines (131 loc) · 6.51 KB
/
font_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Unify font information spread across a bunch of PdfObjects (Font, FontDescriptor,
and FontFile) into a single class.
"""
from pypdf._cmap import build_char_map, prepare_cm
from pypdf.generic import IndirectObject, PdfObject
from rich.text import Text
from yaralyzer.output.rich_console import console
from yaralyzer.util.logging import log
from pdfalyzer.binary.binary_scanner import BinaryScanner
from pdfalyzer.output.character_mapping import print_character_mapping, print_prepared_charmap
from pdfalyzer.output.tables.font_summary_table import font_summary_table
from pdfalyzer.output.layout import print_section_subheader
from pdfalyzer.output.styles.node_colors import get_label_style
from pdfalyzer.util.adobe_strings import (FONT, FONT_DESCRIPTOR, FONT_FILE, FONT_LENGTHS, RESOURCES,
SUBTYPE, TO_UNICODE, TYPE, W, WIDTHS)
FONT_SECTION_PREVIEW_LEN = 30
class FontInfo:
@classmethod
def extract_font_infos(cls, obj_with_resources: PdfObject) -> ['FontInfo']:
"""
Extract all the fonts from a given /Resources PdfObject node.
obj_with_resources must have '/Resources' because that's what _cmap module expects
"""
resources = obj_with_resources[RESOURCES]
if isinstance(resources, IndirectObject):
resources = resources.get_object()
fonts = resources.get(FONT)
if fonts is None:
log.info(f'No fonts found in {obj_with_resources}')
return []
fonts = fonts.get_object()
return [cls.build(label, font, obj_with_resources) for label, font in fonts.items()]
@classmethod
def build(cls, label: str, font_ref: IndirectObject, obj_with_resources) -> 'FontInfo':
"""Build a FontInfo object from a IndirectObject ref to a /Font"""
font_obj = font_ref.get_object()
font_descriptor = None
font_file = None
if font_obj.get(TYPE) != FONT:
raise TypeError(f"{TYPE} of {font_ref} is not {FONT}")
if FONT_DESCRIPTOR in font_obj:
font_descriptor = font_obj[FONT_DESCRIPTOR].get_object()
font_file_keys = [k for k in font_descriptor.keys() if FONT_FILE in k]
# There's /FontFile, /FontFile2, etc. but whichever it is there should be only one
if len(font_file_keys) > 1:
raise RuntimeError(f"Too many /FontFile keys in {font_descriptor}: {font_file_keys}")
elif len(font_file_keys) == 0:
log.info(f"No font_file found in {font_descriptor}")
else:
font_file = font_descriptor[font_file_keys[0]].get_object()
return cls(label, font_ref.idnum, font_obj, font_descriptor, font_file, obj_with_resources)
def __init__(self, label, idnum, font, font_descriptor, font_file, obj_with_resources):
self.label = label
self.idnum = idnum
self.font_file = font_file
self.descriptor = font_descriptor
# /Font attributes
self.font = font
self.sub_type = font.get(SUBTYPE)
self.widths = font.get(WIDTHS) or font.get(W)
if isinstance(self.widths, IndirectObject):
self.widths = self.widths.get_object()
self.base_font = font.get('/BaseFont')
self.first_and_last_char = [font.get('/FirstChar'), font.get('/LastChar')]
self.display_title = f"{self.idnum}. Font {self.label} "
if self.sub_type is None:
log.warning(f"Font type not given for {self.display_title}")
self.display_title += "(UNKNOWN FONT TYPE)"
else:
self.display_title += f"({self.sub_type[1:]})"
# FontDescriptor attributes
if font_descriptor is not None:
self.bounding_box = font_descriptor.get('/FontBBox')
self.flags = font_descriptor.get('/Flags')
else:
self.bounding_box = None
self.flags = None
# /FontFile attributes
if font_file is not None:
self.lengths = [font_file[k] for k in FONT_LENGTHS if k in font_file]
self.stream_data = font_file.get_data()
self.advertised_length = sum(self.lengths)
scanner_label = Text(self.display_title, get_label_style(FONT_FILE))
self.binary_scanner = BinaryScanner(self.stream_data, self, scanner_label)
self.prepared_char_map = prepare_cm(font) if TO_UNICODE in font else None
# TODO: shouldn't we be passing ALL the widths?
self._char_map = build_char_map(label, self.widths[0], obj_with_resources)
try:
self.character_mapping = self._char_map[3]
except (IndexError, TypeError):
log.warning(f"Exception trying to get character mapping for {self}")
self.character_mapping = []
else:
self.lengths = None
self.stream_data = None
self.advertised_length = None
self.binary_scanner = None
self.prepared_char_map = None
self._char_map = None
self.character_mapping = None
def width_stats(self):
if self.widths is None:
return {}
return {
'min': min(self.widths),
'max': max(self.widths),
'count': len(self.widths),
'unique_count': len(set(self.widths)),
}
def print_summary(self):
"""Prints a table of info about the font drawn from the various PDF objects. quote_type of None means all."""
print_section_subheader(str(self), style='font.title')
console.print(font_summary_table(self))
console.line()
print_character_mapping(self)
print_prepared_charmap(self)
console.line()
# TODO: currently unused
def preview_bytes_at_advertised_lengths(self):
"""Show the bytes at the boundaries provided by /Length1, /Length2, and /Length3, if they exist"""
lengths = self.lengths or []
if self.lengths is None or len(lengths) <= 1:
console.print("No length demarcations to preview.", style='grey.dark')
for i, demarcation in enumerate(lengths[1:]):
console.print(f"{self.font_file} at /Length{i} ({demarcation}):")
print(f"\n Stream before: {self.stream_data[demarcation - FONT_SECTION_PREVIEW_LEN:demarcation + 1]}")
print(f"\n Stream after: {self.stream_data[demarcation:demarcation + FONT_SECTION_PREVIEW_LEN]}")
print(f"\nfinal bytes back from {self.stream_data.lengths[2]} + 10: {self.stream_data[-10 - -f.lengths[2]:]}")
def __str__(self) -> str:
return self.display_title