-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlatex_format_prova.py
1252 lines (1162 loc) · 73.9 KB
/
latex_format_prova.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 28 10:42:48 2023
@author: Roger Balsach
"""
from collections.abc import Iterable
import pathlib
import functools
import re
import sys
from typing import Optional, Union
import numpy as np
# Regex fragments (as source text) for the tokens at which equations are split.
# Spacing macros that separate two equations written on one line.
separate_list = [r'\\quad', r'\\qquad']
# Arrows / implications; \xrightarrow may carry an optional [..] and a {..}.
# NOTE(review): r'\\to' has no word boundary, so it also matches the start of
# longer macros such as \top -- confirm this is intended.
arrow_list = [r'\\to', r'\\xrightarrow(\[.*?\])?\{.*?\}', r'\\Longrightarrow']
# Relation symbols treated as "equalities".
equal_list = ['=', r'\\equiv', r'\\cong', r'\\neq']
# A backslash is allowed after these characters without space
# (each entry is written so that it can be placed inside a regex character
# class, see pattern_backslash below).
backslash_prefactor = [r'\(', r'\[', r'\{', r'\s', r'\$', r'\^', r'\\', r'\-',
                       r'\|', '_', '%']
# Macros whose braced argument is text rather than math.
context_list = [r'\\label', r'\\text']
# Plain substrings (not regexes): lines containing any of them are treated as
# macro definitions by the formatter.
def_list = [r'\def', r'\newcommand', r'\renewcommand']

### Define regex patterns:
pattern_separate = re.compile('|'.join(separate_list))
pattern_arrow = re.compile('|'.join(arrow_list))
pattern_equal = re.compile('|'.join(equal_list))
# One character that is NOT in backslash_prefactor, followed by a backslash
# that does not start "\right".
pattern_backslash = re.compile(
    rf'[^{"".join(backslash_prefactor)}]\\(?!right)'
)
# Anchored at the end of the string: used on the text *preceding* an opening
# brace to detect that the brace belongs to \label or \text.
pattern_context = re.compile(rf'({"|".join(context_list)})$')

# Recursive description of the nesting found on a line.
# Key:   ((start, end), char) -- indices of the opening and closing character
#        (either may be None when that side is not on the line) and the
#        opening character of the group ('(', '[', '{' or '$').
# Value: the structure of the groups nested directly inside it.
ParenthesisType = dict[
    tuple[
        Union[
            tuple[Optional[int], int],
            tuple[int, Optional[int]]
        ],
        str
    ],
    'ParenthesisType'
]
class Parenthesis:
    """Parse the bracket and inline-math nesting of one line of TeX.

    ``parse`` returns a nested dict.  Every key is ``((start, end), char)``
    where ``start``/``end`` are the indices of the opening/closing character
    (``None`` when that side is not on the line) and ``char`` is the opening
    character (``(``, ``[``, ``{`` or ``$``).  Every value is the structure
    of the groups nested directly inside that group.
    """

    OPEN_PARENTHESIS = '([{'
    CLOSE_PARENTHESIS = ')]}'

    def __init__(self) -> None:
        # All parser state is created here (and reset by ``parse``) so that
        # the helper methods never hit a missing attribute.
        self.line: str = ''
        self.in_equation: bool = False
        # Stack of (index, opening char) for the groups currently open.
        self.levels: list[tuple[int, str]] = []
        self.parenthesis_structure: 'ParenthesisType' = {}
        # Dict into which the next opened group is inserted.
        self.current_struct: 'ParenthesisType' = self.parenthesis_structure

    def add_open_brace(self, idx: int, char: str) -> None:
        """Open a new group at ``idx`` and descend into it."""
        self.levels.append((idx, char))
        self.current_struct[((idx, None), char)] = {}
        self.current_struct = self.current_struct[((idx, None), char)]

    @classmethod
    def get_match(cls, char: str) -> str:
        """Return the partner of ``char`` (``$`` is its own partner).

        Raises:
            ValueError: if ``char`` is not a parenthesis character or ``$``.
        """
        if char in cls.OPEN_PARENTHESIS:
            return cls.CLOSE_PARENTHESIS[cls.OPEN_PARENTHESIS.index(char)]
        if char in cls.CLOSE_PARENTHESIS:
            return cls.OPEN_PARENTHESIS[cls.CLOSE_PARENTHESIS.index(char)]
        if char == '$':
            return char
        raise ValueError(f'{char!r} is not a parenthesis character or "$"')

    @classmethod
    def match(cls, first: str, second: str) -> bool:
        """Tell whether ``first`` is the partner of ``second``."""
        return first == cls.get_match(second)

    def process_end_equation(self, char: str, idx: int, start: int) -> bool:
        """Handle a closing character met while the innermost group is ``$``.

        Returns ``False`` when the ``$`` at ``idx`` is escaped (and therefore
        does not close the equation), ``True`` when it closes it.

        Raises:
            ValueError: if ``char`` is a closing parenthesis instead of ``$``.
        """
        if char != '$':
            raise ValueError(
                f'Parenthesis mismatch in line: {self.line}'
            )
        if self.is_escaped(idx):
            return False
        self.in_equation = False
        return True

    def process_not_match(self, idx: int, char: str, start: int
                          ) -> tuple[int, str]:
        """Handle a closing ``char`` that does not match the innermost group.

        Returns ``(start, schar)``; an empty ``schar`` tells the caller to
        ignore the closing character.

        Raises:
            ValueError: if no unescaped ``{`` is open on the line.
        """
        if char == ')':
            # Assume that ) is not part of a parenthesis
            return start, ''
        # Check its not a phantom context: discard open groups until an
        # unescaped '{' is found.
        # NOTE(review): the ``else`` is taken to belong to the ``while``
        # (raise only when the stack is exhausted) -- confirm.
        while self.levels:
            start, schar = self.levels.pop()
            if schar == '{' and not self.is_escaped(start):
                break
        else:
            raise ValueError(
                f'Parenthesis not well written: {self.line}'
            )
        if char == '}':
            return start, schar
        # Any other closing char: keep the brace open and ignore ``char``.
        self.levels.append((start, schar))
        return start, ''

    def process_unmatched(self, unmatched_parenthesis: str, char: str) -> bool:
        """Check ``char`` against the last group left open by previous lines.

        Returns ``True`` when ``char`` must be ignored (a stray ``)``).

        Raises:
            ValueError: if ``char`` closes a different kind of group.
        """
        schar = unmatched_parenthesis[-1]
        if not self.match(schar, char):
            if char == ')':
                return True
            raise ValueError(
                f'Parenthesis not well written in line:\n'
                f'{self.line}\n'
                f'Expected: {self.get_match(schar)}, '
                f'Found: {char}'
            )
        if char == '$':
            self.in_equation = False
        return False

    def parse(self, line: str, unmatched_parenthesis: str) -> 'ParenthesisType':
        """Return the nesting structure of ``line``.

        ``unmatched_parenthesis`` lists, outermost first, the opening
        characters left open by previous lines.
        """
        self.line = line
        self.in_equation = '$' in unmatched_parenthesis
        self.parenthesis_structure = {}
        self.levels = []
        self.current_struct = self.parenthesis_structure
        for idx, char in enumerate(line):
            if char in self.OPEN_PARENTHESIS:
                self.add_open_brace(idx, char)
            elif char == '$' and not self.in_equation:
                if self.is_escaped(idx):
                    continue
                self.add_open_brace(idx, '$')
                self.in_equation = True
            elif char in self.CLOSE_PARENTHESIS + '$':
                if self.levels:
                    start, schar = self.levels.pop()
                    if schar == '$':
                        valid = self.process_end_equation(char, idx, start)
                        if not valid:
                            self.levels.append((start, '$'))
                            continue
                    elif not self.match(char, schar):
                        self.levels.append((start, schar))
                        start, schar = self.process_not_match(idx, char, start)
                        if not schar:
                            continue
                    elif char == '}':
                        if self.is_escaped(idx):
                            # Record the position of the backslash of "\}".
                            idx -= 1
                else:
                    # Nothing is open on this line: the character closes a
                    # group opened on a previous line (or is stray).
                    if unmatched_parenthesis:
                        escaped = self.process_unmatched(unmatched_parenthesis,
                                                         char)
                        if escaped:
                            continue
                        unmatched_parenthesis = unmatched_parenthesis[:-1]
                    elif char == ')':
                        continue
                    elif char == '}':
                        if self.is_escaped(idx):
                            idx -= 1
                    schar = self.get_match(char)
                    # Everything parsed so far lies inside the group that
                    # is being closed, so it becomes its child.
                    self.parenthesis_structure = {
                        ((None, idx), schar): self.parenthesis_structure
                    }
                    self.current_struct = self.parenthesis_structure
                    continue
                self.current_struct = self.update_structure(start, schar, idx)
        # Groups still open at the end of the line get ``end = None``.
        while self.levels:
            start, schar = self.levels.pop()
            self.current_struct = self.update_structure(start, schar, None)
        return self.parenthesis_structure

    def is_escaped(self, idx: int) -> bool:
        """Tell whether the character at ``idx`` is preceded by a backslash."""
        return idx > 0 and self.line[idx - 1] == '\\'

    def update_structure(self, start: int, schar: str, idx: Optional[int]
                         ) -> 'ParenthesisType':
        """Close the group opened at ``start`` and return its parent dict.

        The provisional key ``((start, None), schar)`` is replaced by
        ``((start, idx), schar)``.
        """
        parent_structure = self.parenthesis_structure
        # Walk down through the groups that are still open.
        for _idx, char in self.levels:
            parent_structure = parent_structure[(_idx, None), char]
        parent_structure.pop(((start, None), schar))
        parent_structure[((start, idx), schar)] = self.current_struct
        return parent_structure
# TODO: Add CLI interface
# TODO: Implement read from file properly
class TeXFormatter:
    """Format LaTeX source: normalise spacing, then split long lines."""

    def __init__(self, content: Union[str, list[str]]) -> None:
        """Format ``content`` and store the result in ``formatted_lines``.

        Args:
            content: the TeX source, either as one string or as a list of
                lines (each keeping its line ending).
        """
        if isinstance(content, str):
            content = content.splitlines(keepends=True)
        # Untouched copy of the input, used by __repr__ to detect changes.
        self.init_string = list(content)
        self.reset_context()
        # Opening characters left unclosed by the lines processed so far.
        self.multline_parenthesis = ''
        self.indent = ''
        # _format_spaces rewrites and inserts entries in the list it is
        # given, so hand it a private copy instead of the caller's list.
        format_content = self._format_spaces(list(content))
        self.formatted_lines = self.format_tex(format_content, first=True)
@property
def context(self) -> str:
    """Innermost (most recently entered) context on the context stack."""
    innermost = self._context[-1]
    return innermost
def update_context(self, line: str) -> None:
    """Update the context stack from the environments found on ``line``.

    ``\\begin{...}`` (or ``\\beq``) pushes a context, ``\\end`` (or
    ``\\eeq``) pops one.  Lines containing a macro definition (see
    ``def_list``) are ignored.
    """
    if any(x in line for x in def_list):
        return
    # if self.context == 'text' and '$' in self.multline_parenthesis:
    #     self._context.append('equation')
    if r'\begin' in line:
        if 'equation' in line or 'align' in line or 'eqnarray' in line:
            self._context.append('equation')
        elif 'document' in line or 'figure' in line:
            self._context.append('text')
        elif 'verbatim' in line:
            self._context.append('verbatim')
        else:
            from warnings import warn
            warn(f'unknown environment: {line}')
            # Unknown environments inherit the surrounding context.
            self._context.append(self._context[-1])
    elif r'\beq' in line:
        self._context.append('equation')
    # Never pop the base context: an unbalanced \end (e.g. a snippet that
    # starts inside an environment) would otherwise empty the stack and
    # make the next ``self.context`` lookup raise IndexError.
    if (r'\end' in line or r'\eeq' in line) and len(self._context) > 1:
        self._context.pop()
def reset_context(self) -> None:
    """Reset the context stack to a single top-level 'text' context."""
    self._context = ['text']
def _format_spaces(self, lines: list[str]) -> list[str]:
    """Normalise indentation and spacing of every line.

    ``lines`` is modified in place (entries are rewritten, and a new entry
    is inserted when a ``\\begin`` is moved to its own line) and returned.
    The context stack is reset before returning.
    """
    for i, line in enumerate(lines):
        # Replace all tabs by spaces
        line = line.expandtabs(4)
        # Calculate the indent of the line, remove spaces in the beginning
        # of the line.
        if line.lstrip().startswith('%%% '):
            # Emacs local variable definition.
            indent = 0
            cmt = '%%% '
        elif line.lstrip().startswith('%%%%'):
            # Long comment. Keep it as is
            continue
        elif line.lstrip().startswith('%'):
            # Line is a comment
            indent = (len(line[1:]) - len(line.lstrip(' %'))) // 4 * 4
            cmt = '%'
        else:
            indent = (len(line) - len(line.lstrip())) // 4 * 4
            cmt = ''
        self.indent = cmt + ' ' * indent
        self.update_context(line)
        if self.context == 'verbatim':
            continue
        line = self.indent + line.lstrip(' %')
        # Remove double spaces (except for the indent).  The literals must
        # be two spaces: with single spaces the replacement is a no-op and
        # the loop never terminates.
        while '  ' in line[indent:]:
            line = self.indent + line.lstrip(' %').replace('  ', ' ')
        # Make sure all the commas are followed by a space, except for ,~
        # and footnotes
        line = re.sub(r',(?!\s|~|\\footnote)', r', ', line)
        # Move "begin" commands to a new line.
        # TODO: The check for "def" commands should be better.
        if (r'\begin' in line.strip(' %')[6:]
                and all(x not in line for x in def_list)):
            idx = line.index(r'\begin')
            # Leave the line alone when the \begin is inside a comment.
            if not ((match := re.search(r'(?<!\\)%', line))
                    and match.start() < idx):
                new_line = self.indent + line[idx:]
                line = line[:idx]
                # The inserted line is processed by the next iteration.
                lines.insert(i+1, new_line)
        offset = len(self.indent)
        self.indent = ''
        if self.context == 'equation':
            add_space = self._equation_addspace(line.lstrip(' %'), offset)
        elif self.context == 'text':
            add_space = self._text_addspace(line.lstrip(' %'), offset)
        # Add all the spaces found previously, right to left so that the
        # remaining positions stay valid.
        for space_pos in sorted(set(add_space), reverse=True):
            line = line[:space_pos] + ' ' + line[space_pos:]
        lines[i] = line.rstrip() + '\n'
    self.reset_context()
    return lines
def _equation_addspace(self, line: str, offset: int = 0) -> list[int]:
    """Return the positions where a space must be inserted in math ``line``.

    The rules are applied to the skeleton of the line (see get_skeleton)
    and then, recursively, to the content of every top-level group.
    Positions are indices into ``line`` shifted by ``offset``.
    Side effects: rewrites ``self.indent`` and may pop the context stack.
    """
    # Find position that need space
    add_space = []
    self.indent = ' ' * (len(line) - len(line.lstrip(' %')))
    skeleton, parenthesis = self.get_skeleton(line)
    # Add a space before '\' except when following ( [ { $ ^ \ or a space
    # or except when prefacing "right".
    if '\\' in skeleton:
        for match in pattern_backslash.finditer(skeleton):
            # No space between a sizing macro and its delimiter.
            if re.search(r'(\\left|\\right|\\[bB]igg?)\s*$',
                         skeleton[:match.start() + 1]):
                continue
            add_space.append(self.get_index_line(match.start(0), line) + 1)
    # Add a space before '&' except when following \ or a space.
    if '&' in skeleton:
        for match in re.finditer(r'[^\\\s]&', skeleton):
            add_space.append(self.get_index_line(match.start(0), line) + 1)
    # Add a space before and after the + - and / operations.
    if ('+' in skeleton or '-' in skeleton or '/' in skeleton
            or '=' in skeleton):
        add_space.extend(
            [self.get_index_line(idx, line)
             for idx in self._format_spaces_operation(skeleton)]
        )
    # Add a space after ).
    if ')' in skeleton or ']' in skeleton or '}' in skeleton:
        for match in re.finditer(r'[\)\]\}][A-Za-z0-9]', skeleton):
            add_space.append(self.get_index_line(match.start(0), line) + 1)
    # Add a space after super and underscript.
    if '_' in skeleton or '^' in skeleton:
        for match in re.finditer(r'[_^]\w[A-Za-z0-9]', skeleton):
            add_space.append(self.get_index_line(match.end(), line) - 1)
    # Add a space after digits.
    for match in re.finditer(r'\d[A-Za-z]', skeleton):
        add_space.append(self.get_index_line(match.end(), line) - 1)
    self.indent = ''
    # Recurse into the content of every top-level group.
    for (start, end), char in parenthesis:
        start = -1 if start is None else start
        if start < 1:
            self.indent = ''
        if char == '{':
            # Argument of \label / \text: format it as text.
            if pattern_context.search(line[:start]):
                add_space.extend(self._text_addspace(line[start+1:end],
                                                     start+1))
                continue
        elif char == '$':
            if end is None:
                self._context.pop()
            # NOTE(review): this ``continue`` is taken to apply to every
            # '$' group, not only to the unclosed one -- confirm.
            continue
        add_space.extend(self._equation_addspace(line[start+1:end],
                                                 start+1))
    return [offset + n for n in add_space]
def _text_addspace(self, line: str, offset: int = 0) -> list[int]:
    """Return the positions where a space must be inserted in text ``line``.

    Text itself is left alone; every ``$...$`` group found at any nesting
    depth is handed to ``_equation_addspace``.  An inline equation left
    open at the end of the line pushes an 'equation' context.
    """
    positions = []
    # Stack of group dicts still to be searched for '$' groups.
    pending = [self._find_parenthesis(line, self.multline_parenthesis)]
    while pending:
        groups = pending.pop()
        for ((start, end), char), children in groups.items():
            if char == '$':
                assert start is not None
                positions.extend(
                    self._equation_addspace(line[start+1:end],
                                            offset=start+1)
                )
                if end is None:
                    self._context.append('equation')
            else:
                # Not math: look for equations inside the group later.
                pending.append(children)
    return [offset + pos for pos in positions]
def _format_spaces_operation(self, line: str, offset: int = 0
                             ) -> list[int]:
    """Return the positions around operators where a space is missing.

    Args:
        line: the text to inspect.
        offset: value added to every returned position.

    Returns:
        First the positions in front of ``+ - / = < >`` and then the
        positions behind ``+ / = \\neq - <``; the list is not sorted.
    """
    # Add a space before an operation, unless preceded by whitespace,
    # by a parenthesis or exponent (^).  Group 1 is exactly one
    # non-space character, so its end is where the space goes.
    add_space = [
        offset + match.end(1)
        for match in re.finditer(r'([^\s\(\[\{\^])[\+\-/=\<\>]', line)
    ]
    # Add a space after an operation if not preceded by parenthesis
    # or followed by whitespace or EOL.
    add_space.extend(
        offset + match.end()
        for match in re.finditer(r'[^\{\(\[](\+|/|=|\\neq|\-|\<)(?!\s|$)',
                                 line)
    )
    return add_space
def format_tex(self, lines: Iterable[str], first: bool = False
               ) -> list[str]:
    """Split every line longer than 80 characters; return the new lines.

    Args:
        lines: lines to format (trailing whitespace is ignored).
        first: ``True`` only for the outermost call; recursive calls use
            ``False``, which additionally splits short equation lines with
            unmatched parenthesis and re-joins short neighbouring lines.

    Raises:
        Exception: whatever the text/equation formatter raised, after the
            offending line has been printed to stderr.
    """
    # TODO: Fix split parenthesis from multiple lines
    new_content = []
    for line in map(str.rstrip, lines):
        # Detect when we are inside an environment
        self.update_context(line)
        # Compute the indent of the line
        self.indent = ' ' * (len(line) - len(line.lstrip()))
        self.commentafter = len(line)
        # TODO: Manage block separators of the form %%%% TEXT %%%%
        if set(line.strip()) == {'%'}:
            # Separator made only of '%': clip it to 79 characters.
            new_content.append(line[:79] + '\n')
            continue
        # TODO: Compute indent from scratch from previous lines.
        if line.strip().startswith('%'):
            level = len(line) - len(line.lstrip(' %'))
            self.indent = '%' + ' ' * (level - 1)
        elif (match := re.search(r'(?<!\\)%(?!$)', line)):
            # Unescaped '%' followed by text: a trailing comment.
            self.commentafter = match.start()
        # An odd number of '$' means an inline equation starts or ends on
        # this line and continues on another one.
        if line.replace('$$', '$').count('$') % 2 == 1:
            if self.context == 'text':
                line_wo_comment = line.partition('%')[0].strip()
                if not line_wo_comment:
                    # Line is a comment
                    line_wo_comment = line.partition('%')[-1].strip()
                if (('$' not in self.multline_parenthesis
                     or line_wo_comment.strip()[0] != '$')
                        and line_wo_comment[-1] != '$'):
                    assert '$' not in self.multline_parenthesis, line
                    # Break right after the last '$': the text up to and
                    # including it, then the start of the equation.
                    idx = line[::-1].find('$')
                    new_content.extend(self.format_tex([line[:-idx]]))
                    self._context.append('equation')
                    new_content.extend(self.format_tex(
                        [self.indent + 4*' ' + line[-idx:].lstrip()]
                    ))
                    continue
            elif self.context == 'equation':
                # Break right before the first '$': the end of the
                # equation, then the '$' and what follows it.
                # NOTE(review): only the first recursive call is taken to
                # be guarded by the ``if``; the pop and the second call run
                # unconditionally -- confirm.
                idx = line.find('$')
                if idx > len(self.indent):
                    new_content.extend(self.format_tex([line[:idx]]))
                self._context.pop()
                # if '$' in self.multline_parenthesis:
                #     assert self.multline_parenthesis[-1] == '$'
                #     self.multline_parenthesis = self.multline_parenthesis[:-1]
                new_content.extend(self.format_tex([self.indent
                                                    + line[idx:]]))
                continue
        # If line is short enough, leave it as it is
        if len(line) <= 80 or self.context == 'verbatim':
            if not first and self.context == 'equation':
                # TODO: If previous lines were splitted, check that all
                # TODO: operations here have lower priority. For example,
                # TODO: if the previous line splitted sums, and this line
                # TODO: contains some sums, we should split them also.
                # If there are unmatched parenthesis, split the line anyway
                ret = self.check_unmatched_parenthesis(line)
                if ret is not None:
                    new_content.extend(ret)
                    continue
            self.update_multiline_parenthesis(line)
            new_content.append(line + '\n')
            continue
        # Format the line according to the actual context
        try:
            if self.context == 'text':
                new_content.extend(self._format_text(line))
            elif self.context == 'equation':
                new_content.extend(self._format_equation(line))
        except Exception as e:
            # Report the offending line once (in the innermost call) and
            # let the error propagate, so that callers such as
            # format_files can add their own context; exiting here would
            # kill the interpreter and lose the traceback.
            if not getattr(e, '_tex_line_reported', False):
                print(type(e), file=sys.stderr)
                print(line, file=sys.stderr)
                try:
                    e._tex_line_reported = True
                except AttributeError:
                    pass
            raise
    # Combine the lines to avoid lines too short
    if not first:
        new_content = self.combine_lines(new_content)
    return new_content
def combine_lines(self, content: list[str]) -> list[str]:
    """Join neighbouring lines while the joined length is at most 80.

    The shortest joinable pair is merged first.  Pairs rejected by
    ``allow_combine`` are masked out and never tried again.  ``content``
    is modified in place and returned.
    """
    # index_mask[i] tells whether content[i] and content[i + 1] may still
    # be tried.
    index_mask = [True,] * (len(content) - 1)
    while len(content) > 1:
        lengths = np.asarray(list(map(len, content)))
        # comb_len[i] = len(content[i]) + len(content[i + 1])
        comb_len = lengths[1:] + lengths[:-1]
        valid_comb = comb_len[(comb_len <= 80) & index_mask]
        if not valid_comb.size:
            break
        # Substitute for?:
        # idx = np.where(comb_len == min(valid_comb))[0][0]
        # assert index_mask[idx] is True
        indices = np.where(comb_len == min(valid_comb))[0]
        # First pair of minimal length that is not masked out.
        for idx in indices:
            if index_mask[idx]:
                break
        else:
            assert False
        first = content[idx]
        second = content[idx + 1]
        assert first, first
        assert second, second
        if not self.allow_combine(first, second):
            index_mask[idx] = False
            continue
        content.pop(idx + 1)
        space = ' '
        # why does the first need to be alnum?
        # if not first[-1].isalnum() and second[0] in {'.', ','}:
        # No space before '.' or ',' (unless first already ends with ','),
        # nor when first ends with an unescaped '%'.
        if (second.lstrip(' %')[0] in {'.', ','}
                and first.strip()[-1] != ','
                or first.strip()[-1] == '%' and first.strip()[-2] != '\\'):
            space = ''
        # Drop trailing whitespace and unescaped '%' from the first line.
        match = re.search(r'^(.*?)(\s|(?<!\\)%)*$', first)
        if match is None:
            raise ValueError()
        first = match.group(1)
        content[idx] = first + space + second.lstrip(' %')
        index_mask.pop(idx)
    return content
def allow_combine(self, first: str, second: str) -> bool:
    """Tell whether two consecutive lines may be joined into one.

    Joining is refused after the end of a sentence, between a comment and
    a non-comment line, around a lone ``$``, and next to the tokens at
    which lines are deliberately broken (relations, separators, arrows,
    ``+``/``-`` and parenthesis).
    """
    # End of a sentence: a dot after a word of 3+ characters or after a
    # non-word character.
    if re.search(r'(?:\w{3,}|\W)\.$', first.strip(' %')):
        return False
    head = first.strip()
    tail = second.strip()
    # Exactly one of the two lines is a comment.
    if (head[0] == '%') != (tail[0] == '%'):
        return False
    head = head.strip(' %')
    tail = tail.strip(' %')
    if head == '$' or tail.count('$') == 1:
        return False
    if pattern_equal.match(tail):
        return False
    for pattern in (pattern_separate, pattern_arrow):
        if pattern.match(tail) or pattern.match(head):
            return False
    if tail.startswith(('+', '-')):
        return False
    if head.endswith(('(', '[', '{')):
        return False
    if tail.startswith((')', ']', '}', r'\}')):
        return False
    if head.endswith(('\\left(', '\\left[', '\\left\\{')):
        return False
    if tail.startswith(('\\right)', '\\right]', '\\right\\}')):
        return False
    # if _first.strip()[-1] == '%' and _first.strip()[-2] != '\\':
    #     return True
    return True
def line_split(self, line: str, pattern: Union[str, re.Pattern],
               keep: Union[bool, str] = False) -> list[str]:
    """Split ``line`` at every match of ``pattern`` and format the pieces.

    The pattern is searched in the skeleton of the line, so matches inside
    parenthesis are ignored.

    Args:
        line: the line to split.
        pattern: regex (compiled or not) marking the split points.
        keep: what happens to the matched text: ``'first'`` leaves it at
            the end of the preceding piece, ``'second'`` at the start of
            the following piece, ``True`` makes it a piece of its own and
            ``False`` drops it.

    Returns:
        The pieces after another pass through ``format_tex``.
    """
    if not isinstance(pattern, re.Pattern):
        pattern = re.compile(pattern)
    skeleton, _ = self.get_skeleton(line, self.multline_parenthesis)
    lines = []
    prev_idx = 0
    for match in pattern.finditer(skeleton):
        # Translate skeleton positions into positions in ``line``.
        start = self.get_index_line(match.start(), line)
        end = self.get_index_line(match.end(), line)
        # In text, a break that does not fall on a space is marked with a
        # '%' at the end of the piece.
        sEOL = eEOL = '%' if self.context == 'text' else ''
        if end == len(line) or line[end] == ' ':
            eEOL = ''
        if line[start-1] == ' ' or line[start] == ' ':
            sEOL = ''
        if keep == 'first':
            lines.append(line[prev_idx:end] + eEOL)
            prev_idx = end
        elif keep == 'second':
            lines.append(line[prev_idx:start] + sEOL)
            prev_idx = start
        elif keep is True:
            lines.append(line[prev_idx:start] + sEOL)
            lines.append(line[start:end] + eEOL)
            prev_idx = end
        elif keep is False:
            lines.append(line[prev_idx:start] + sEOL)
            prev_idx = end
    lines.append(line[prev_idx:])
    # Drop pieces that contain nothing but spaces and '%'.
    lines = [line for line in lines if line.strip(' %')]
    # NOTE(review): ``map`` is lazy, so ``self.indent`` is read when
    # format_tex consumes each piece, not here -- confirm this is intended.
    new_lines = map(lambda s: self.indent + s.lstrip(' %').rstrip() + '\n',
                    lines)
    return self.format_tex(new_lines)
def get_index_line(self, idx: int, line: str) -> int:
    """Translate position ``idx`` of the skeleton into a position in ``line``.

    The length of the current indent and the length of the content of
    every top-level group that starts before the position are added back.
    """
    position = len(self.indent) + idx
    groups = self._find_parenthesis(line, self.multline_parenthesis)
    for (opening, closing), _ in groups:
        if opening is None:
            # Group opened on a previous line: the skeleton starts at its
            # closing character.
            assert closing is not None
            position = closing + idx
        elif opening >= position:
            break
        else:
            assert closing is not None, line
            position += closing - opening - 1
    return position
def _format_text(self, line: str) -> list[str]:
    """Split a too-long line of text; return the resulting lines.

    The first applicable rule wins: trailing comment, explicit newline
    (``\\\\`` / ``\\newline``), end of sentence, ``:``, ``,``, `` and ``,
    inline equations, large groups and finally plain spaces.  A line that
    cannot be split is returned unchanged.  Every returned line ends with
    a newline.
    """
    skeleton, parenthesis = self.get_skeleton(line,
                                              self.multline_parenthesis)
    if self.commentafter < len(line):
        # Separate the trailing comment from the text in front of it.
        new_lines = [line[:self.commentafter+1], line[self.commentafter:]]
        return self.format_tex(new_lines)
    # Split newlines into a new line
    pattern = re.compile(r'(\\\\|\\newline)(?=.)')
    if pattern.search(skeleton):
        return self.line_split(line, pattern, keep='first')
    # Split sentences (separated by . or ?) into multiple lines
    pattern = re.compile(r'.[\.\?](?=\s[A-Z{])')
    if pattern.search(skeleton[:-1]):
        return self.line_split(line, pattern, keep='first')
    # Split the line by ':'
    elif re.search(r':\W', skeleton[:-1]):
        return self.line_split(line, ':', keep='first')
    # Split the line by ','
    elif ',' in skeleton[:-1]:
        return self.line_split(line, ',', keep='first')
    # Split the line by ' and '
    elif ' and ' in skeleton[:-1]:
        return self.line_split(line, r'(?<=\s)and(?=\s)', keep='second')
    # Split the formulas into a new line.
    new_lines = []
    if skeleton == '$$':
        # The line is a single inline equation: '$', content, '$...'.
        start = self.get_index_line(0, line)
        end = self.get_index_line(1, line)
        new_lines.append(line[:start+1].rstrip() + '\n')
        self._context.append('equation')
        indent = self.indent
        new_lines.extend(self.format_tex(
            [indent + 4 * ' ' + line[start+1:end].lstrip()]
        ))
        self._context.pop()
        # TODO?: Add % after last $.
        new_lines.append(indent + line[end:].strip() + '\n')
        return [line for line in new_lines if line]
    if ' $$' in skeleton:
        return self.line_split(line, r'\s\$\$', keep=True)
    # Split {} into multiple lines
    for (_start, _end), char in parenthesis:
        start = _start+1 if _start is not None else 0
        end = _end if _end is not None else len(line)
        # Only groups that are big enough are worth their own lines.
        if end - start > 40 and char == '{':
            pass
        elif end - start > 75 and char in '([':
            pass
        else:
            continue
        # Decide whether to put comment at the end of the line or not.
        EOL1 = EOL2 = '%'
        if (line[start] == ' '
                and (char != '{' or start > 2 and line[start-2] != '\\')):
            EOL1 = ''
        if _end is None and line.strip()[-1] != '%':
            EOL2 = ''
        new_lines.append(self.indent + line[:start].lstrip(' %') + EOL1)
        new_lines.append(
            self.indent + 4 * ' ' + line[start:end].lstrip() + EOL2
        )
        new_lines.append(self.indent + line[end:].lstrip())
        new_lines = [line for line in new_lines if line.strip(' %')]
        if new_lines:
            return self.format_tex(new_lines)
    if ' ' in skeleton:
        return self.line_split(line, ' ', keep=False)
    # Nothing to split at.  ``line`` has no line ending here (format_tex
    # strips it), so add one: every other path returns newline-terminated
    # lines, and without it this line is glued to the next one on output.
    return [line + '\n']
def _format_equation(self, line: str) -> list[str]:
    """Split a too-long equation line; return the resulting lines.

    The first applicable rule wins, from the most to the least
    significant split point.

    Raises:
        NotImplementedError: if no rule is able to split the line.
    """
    skeleton, _ = self.get_skeleton(line, self.multline_parenthesis)
    # Split label into own line
    if r'\label' in line:
        return self.line_split(line, r'\\label', keep='second')
    # Equation separators (quad) and implications get a line of their own.
    for pattern in (pattern_separate, pattern_arrow):
        if pattern.search(skeleton):
            return self.line_split(line, pattern, keep=True)
    # Split line in equality (one at the very start does not count)
    if pattern_equal.search(skeleton[1:]):
        return self.line_split(line, pattern_equal, keep='second')
    # In order: split right after/before an unmatched parenthesis, split
    # sums and subtractions, split parenthesis that are big enough.
    splitters = (
        self.check_unmatched_parenthesis,
        self.split_sums,
        self.split_large_parenthesis,
    )
    for splitter in splitters:
        pieces = splitter(line)
        if pieces is not None:
            return pieces
    # Split the spaces of skeleton
    if ' ' in skeleton:
        return self.line_split(line, r'(?<!\\)\s')
    # If the parenthesis are not big enough, split the line right
    # after a parenthesis
    # TODO: There is no test for this line!
    pieces = self.split_after_parenthesis(line)
    if pieces is not None:
        return pieces
    raise NotImplementedError(f'line "{line}" not splitted.')
def split_sums(self, line: str) -> Optional[list[str]]:
    """Break ``line`` in front of every top-level ``+`` or ``-``.

    Signs that directly follow ``=``, ``&`` or the start of the skeleton
    are not split points.  Returns ``None`` when there is nothing to
    split, otherwise the formatted pieces.
    """
    skeleton, _ = self.get_skeleton(line, self.multline_parenthesis)
    pieces = []
    cut = 0
    # TODO: Handle cases like \cong - 3, etc.
    # This should be done easier with new python 3.11 re functions.
    for match in re.finditer(r'[^=\s&]\s*(\+|\-)', skeleton):
        next_cut = self.get_index_line(match.start(1), line)
        pieces.append(self.indent + line[cut:next_cut].lstrip(' %'))
        cut = next_cut
    if not pieces:
        return None
    # Remainder of the line, starting at the last sign.
    pieces.append(self.indent + line[cut:].lstrip())
    return self.format_tex(pieces)
def check_unmatched_parenthesis(self, line: str) -> Optional[list[str]]:
    """Split ``line`` at the first top-level parenthesis left unmatched.

    Text in front of a closing parenthesis whose opening is on a previous
    line, or behind an opening parenthesis that is not closed on this
    line, is moved to its own line, indented 4 more spaces.  Falls off
    the end (returns ``None``) when there is nothing to split.
    """
    parenthesis = self._find_parenthesis(line, self.multline_parenthesis)
    for (start, end), _ in parenthesis:
        if start is None and end > len(self.indent):
            # Keep a preceding \right, \big, \Bigg ... (and the backslash
            # of an escaped delimiter) together with the delimiter.
            if (match := re.search(r'\\((right|[bB]igg?)\s?\\?)?$',
                                   line[:end])):
                end = match.start()
            if end == len(self.indent):
                # Nothing but the indent in front of the delimiter.
                continue
            new_lines = [
                self.indent + 4*' ' + line[:end].strip('% ') + '\n',
                self.indent + line[end:].rstrip() + '\n'
            ]
        elif end is None and start + 2 < len(line.rstrip()):
            new_lines = [
                line[:start+1].rstrip() + '\n',
                self.indent + 4*' ' + line[start+1:].strip() + '\n'
            ]
        else:
            continue
        return self.format_tex(new_lines)
def split_large_parenthesis(self, line: str) -> Optional[list[str]]:
    """Put the content of the first big top-level group on its own line.

    Only groups that are opened and closed on this line and whose
    delimiters are at least 30 characters apart are considered.  Falls
    off the end (returns ``None``) when no such group exists.
    """
    parenthesis = self._find_parenthesis(line, self.multline_parenthesis)
    for (start, end), _ in parenthesis:
        if end is None or start is None or end - start < 30:
            continue
        if pattern_context.search(line[:start]):
            # Argument of \label / \text: format the content as text and
            # protect the line breaks with '%'.
            indent = self.indent
            new_lines = [line[:start + 1] + '%\n']
            self._context.append('text')
            new_lines += self.format_tex(
                [self.indent + 4*' ' + line[start+1:end].lstrip() + '%\n']
            )
            self._context.pop()
            new_lines += [indent + line[end:] + '\n']
            return self.format_tex(new_lines)
        # Keep a preceding \right, \big, \Bigg ... with the closing
        # delimiter.
        if (match := re.search(r'\\(right|[bB]igg?)\s?\\?$', line[:end])):
            end = match.start()
        new_lines = [
            line[:start + 1] + '\n',
            self.indent + 4*' ' + line[start+1:end].lstrip() + '\n',
            self.indent + line[end:] + '\n'
        ]
        return self.format_tex(new_lines)
def split_after_parenthesis(self, line: str) -> Optional[list[str]]:
    """Break ``line`` right after the last suitable top-level group.

    Groups are tried from right to left.  A group is skipped when it is
    not closed on this line, when it closes beyond column 80, or when it
    is a ``{}`` directly followed by another ``{`` (consecutive arguments
    of a macro stay together).  Returns ``None`` when no group qualifies.
    """
    parenthesis = self._find_parenthesis(line, self.multline_parenthesis)
    for (start, end), char in reversed(parenthesis):
        # ``end`` is None for a group left open on this line; comparing
        # None with an int would raise TypeError.
        if end is None or end > 80:
            continue
        elif char == '{' and end + 1 < len(line) and line[end+1] == '{':
            continue
        new_lines = [line[:end + 1] + '\n',
                     self.indent + line[end+1:].strip() + '\n']
        return self.format_tex(new_lines)
    return None
def update_multiline_parenthesis(self, line: str) -> None:
    """Update ``self.multline_parenthesis`` with the groups that ``line``
    closes or leaves open.

    Lines containing 'phantom' are ignored.  An inline equation left open
    also pushes an 'equation' context.
    """
    if 'phantom' in line:
        return
    open_p = '([{'
    close_p = ')]}'
    # Work on a copy: the dict comes from a cached function and is
    # consumed below.
    parenthesis = self._find_parenthesis(line,
                                         self.multline_parenthesis).copy()
    # (position, char) of every delimiter without a partner on this line;
    # char is the delimiter as written (closing or opening, or '$').
    new_parenthesis = []
    while parenthesis:
        ((start, end), char), child = parenthesis.popitem()
        if start is None:
            new_parenthesis.append((end, Parenthesis.get_match(char)))
        elif end is None:
            new_parenthesis.append((start, char))
        else:
            # Group complete on this line: nothing to record.
            continue
        # Groups nested inside an incomplete group are inspected as well.
        parenthesis |= child
    # ``sent`` stays True while only closings (and '$') have been seen:
    # a closing delimiter must not come after a new opening one.
    sent = True
    for _, char in sorted(new_parenthesis):
        if char in close_p:
            assert sent, line
            assert self.multline_parenthesis, line
            _char = self.multline_parenthesis[-1]
            assert open_p.index(_char) == close_p.index(char), line
            self.multline_parenthesis = self.multline_parenthesis[:-1]
            continue
        elif char == '$':
            if '$' not in self.multline_parenthesis:
                # Start of an inline equation that continues below.
                assert line.rstrip()[-1] == '$'
                self.multline_parenthesis += '$'
                self._context.append('equation')
            else:
                # End of an inline equation started on a previous line.
                assert self.multline_parenthesis[-1] == '$'
                self.multline_parenthesis = self.multline_parenthesis[:-1]
            continue
        sent = False
        if char in open_p:
            self.multline_parenthesis += char
@classmethod
@functools.cache
def get_skeleton(cls, line: str, unmatched_parenthesis: str = ''
                 ) -> tuple[str, ParenthesisType]:
    """Return the skeleton of ``line`` and its parenthesis structure.

    The skeleton is the line with the content of every top-level group
    removed (the delimiters themselves are kept) and with surrounding
    spaces and '%' stripped.  Results are cached, so the returned dict is
    shared between calls and must not be modified.
    """
    parenthesis = cls._find_parenthesis(line, unmatched_parenthesis)
    skeleton = line.lstrip(' %')
    # Number of characters of ``line`` missing in front of a given
    # skeleton position.
    offset = len(line) - len(skeleton)
    for (start, end), _ in parenthesis:
        if end is None:
            # Group left open: drop everything after its opening char.
            skeleton = skeleton[:start-offset+1]
            break
        elif start is None:
            # Group opened on a previous line: drop everything before its
            # closing char.
            skeleton = skeleton[end-offset:]
            offset = end
        else:
            skeleton = skeleton[:start-offset+1] + skeleton[end-offset:]
            offset += end - start - 1
    return skeleton.strip(' %'), parenthesis
@staticmethod
@functools.cache
def _find_parenthesis(line: str, unmatched_parenthesis: str = ''
                      ) -> ParenthesisType:
    """Return the parenthesis structure of ``line`` (see ``Parenthesis``).

    Results are cached per ``(line, unmatched_parenthesis)``, so the
    returned dict is shared between calls; copy it before modifying it.
    """
    return Parenthesis().parse(line, unmatched_parenthesis)
def __repr__(self) -> str:
    """Return the formatted text, or a fixed notice if nothing changed."""
    unchanged = self.formatted_lines == self.init_string
    return 'String not modified' if unchanged else ''.join(self.formatted_lines)
def __eq__(self, other: object) -> bool:
    """Compare the textual representation of the formatter with a string."""
    if not isinstance(other, str):
        # Let Python try the reflected comparison.
        return NotImplemented
    return repr(self) == other
def format_files(*names: str | pathlib.PosixPath) -> None:
    """Format every given TeX file in place.

    Args:
        names: paths of the files, relative to the current working
            directory (absolute paths are used as they are).

    Raises:
        Exception: an error of the type raised by the formatter whenever
            that type can be built from a single message, otherwise a
            ``RuntimeError``; the message names the offending file.
    """
    pathdir = pathlib.Path.cwd()
    for name in names:
        filename = pathdir / pathlib.Path(name)
        print(filename)
        with filename.open() as file:
            content = file.readlines()
        # import pdb; pdb.set_trace()
        try:
            new_content = TeXFormatter(content).formatted_lines
        except Exception as e:
            err_msg = 'Exception encountered when formatting the file '
            err_msg += f'{filename}: {e}'
            # Not every exception class can be built from one string
            # (e.g. UnicodeDecodeError needs five arguments); fall back
            # to RuntimeError rather than hiding the real error behind a
            # TypeError.
            try:
                wrapped = type(e)(err_msg)
            except Exception:
                wrapped = RuntimeError(err_msg)
            raise wrapped.with_traceback(e.__traceback__) from None
        # The file is overwritten only after formatting has succeeded.
        with filename.open('w') as file:
            file.writelines(new_content)
def format_string() -> None:
    """Format a hard-coded sample paragraph and print the result."""
    sample = r'''
distribution functions and $d \hat \si^{\rm(res)}_{ij\tosv \tth}/ dQ^2 \left. \right|_{\scriptscriptstyle({\rm NNLO})}$ represents the perturbative expansion of the resummed cross section truncated at NNLO. The inverse Mellin transform (\ref{invmel}) is evaluated numerically using according to the ``Minimal Prescription'' ~\cite{Catani:1996yz}
'''
    print(TeXFormatter(sample))
if __name__ == '__main__':
    # Inside IPython the name __IPYTHON__ exists: run the demo string.
    # Otherwise treat the command-line arguments as files to format.
    # Only the probe is inside the ``try``, so that a NameError raised by
    # format_string() itself is not mistaken for "not running in IPython".
    try:
        __IPYTHON__
    except NameError:
        _, *names = sys.argv
        format_files(*names)
    else:
        format_string()
### TESTS ###
# 5
s = r'''
\cite{CMS:2014wdm,CMS:2017odg,ATLAS:2017ztq,CMS:2018fdh,ATLAS:2018mme,ATLAS:2018kot,ATLAS:2018ynr},
'''
assert TeXFormatter(s) == '\n\\cite{%\n CMS:2014wdm, CMS:2017odg, ATLAS:2017ztq, CMS:2018fdh,\n ATLAS:2018mme, ATLAS:2018kot, ATLAS:2018ynr%\n},\n'
# 5
s = r'''
\title{State-of-the-art cross sections for $\boldsymbol{t\bar t H}$:\\ NNLO predictions matched with NNLL resummation and EW corrections}
'''
assert TeXFormatter(s) == '\n\\title{%\n State-of-the-art cross sections for $\\boldsymbol{t \\bar t H}$:\\\\\n NNLO predictions matched with NNLL resummation and EW corrections%\n}\n'
# 6
s = r'''
\begin{itemize}
\item $\muF =\muR = m_t+m_H/2$
\item $\muF =\muR =H_T/2$
\item $\muF =\muR \equiv Q/2$ ($Q\equiv M_{ttH}$)
\end{itemize}
'''
assert TeXFormatter(s) == '\n\\begin{itemize}\n\\item $\\muF = \\muR = m_t + m_H / 2$\n\\item $\\muF = \\muR = H_T / 2$\n\\item $\\muF = \\muR \\equiv Q / 2$ ($Q \\equiv M_{ttH}$)\n\\end{itemize}\n'
# 6
s = r'''
In particular, we are interested in evolving $S$ from the scale where renormalization takes place $\mu_0=\mu_R$ to the soft scale $\mu = Q\bar{N}^{-1}$. Where $\bar{N}=Ne^{\gamma_E}$ as defined in \cite{Kulesza17}. Then
'''
assert TeXFormatter(s) == '\nIn particular, we are interested in evolving $S$\nfrom the scale where renormalization takes place\n$\\mu_0 = \\mu_R$ to the soft scale $\\mu = Q \\bar{N}^{-1}$.\nWhere $\\bar{N} = Ne^{\\gamma_E}$ as defined in \\cite{Kulesza17}.\nThen\n'
# 6
s = r'''
\def\cCode#1{\begin{lstlisting}[mathescape,basicstyle=\small
\ttfamily,frame=leftline,aboveskip=4mm,belowskip=4mm,xleftmargin=20pt,framexleftmargin=10pt,
numbers=none,framerule=2pt,abovecaptionskip=0.0mm,belowcaptionskip=3.5mm #1]}
'''
assert TeXFormatter(s) == '\n\\def\\cCode#1{\\begin{lstlisting}[mathescape, basicstyle=\\small\n\\ttfamily, frame=leftline, aboveskip=4mm, belowskip=4mm,\nxleftmargin=20pt, framexleftmargin=10pt,\nnumbers=none, framerule=2pt, abovecaptionskip=0.0mm, belowcaptionskip=3.5mm #1]}\n'
#6
# Prose with inline formulas using `\left.` ... `\right|` and a trailing `\\`.
# Expected output: `\left.\sigma` becomes `\left. \sigma`, a line break is
# inserted before each parenthesised formula, the leading space in `$ \sigma`
# is preserved, and the original line breaks and the final `\\` are kept.
s = r'''
where $\left.\sigma_{\rm NNLL}^{\rm SCET}\right|_{\alphas^2}$ ($\left.\sigma_{\rm NNLL}^{\rm dQCD}\right|_{\alphas^2}$) is
the expansion of $ \sigma_{\rm NNLL}^{\rm SCET}$ ($ \sigma_{\rm NNLL}^{\rm dQCD}$) up to order $\alphas^2$.\\
The two resummed predictions are combined by simply considering their average:
'''
assert TeXFormatter(s) == '\nwhere $\\left. \\sigma_{\\rm NNLL}^{\\rm SCET}\\right|_{\\alphas^2}$\n($\\left. \\sigma_{\\rm NNLL}^{\\rm dQCD}\\right|_{\\alphas^2}$) is\nthe expansion of $ \\sigma_{\\rm NNLL}^{\\rm SCET}$\n($ \\sigma_{\\rm NNLL}^{\\rm dQCD}$) up to order $\\alphas^2$.\\\\\nThe two resummed predictions are combined by simply considering their average:\n'
# 8
# A long sentence followed by an equation* environment.
# Expected output: the sentence is wrapped onto two lines; the equation body
# is indented, broken before the `=` and before the binary `-`, and adjacent
# factors are separated by spaces (`2\pi\alpha_Sb_0` -> `2 \pi \alpha_S b_0`).
s = r'''
Now, equating both derivatives and using equation \eqref{eq:Kmatrix} for $U$ we arrive at the following equation:
\begin{equation*}
\frac{\Gamma(\alpha_S)}{\beta(\alpha_S)}K(\alpha_S)
= \dv{K(\alpha_S)}{\alpha_S} - K(\alpha_S)\frac{\Gamma^{(1)}}{2\pi\alpha_Sb_0}
\end{equation*}
'''
assert TeXFormatter(s) == '\nNow, equating both derivatives and using equation \\eqref{eq:Kmatrix} for\n$U$ we arrive at the following equation:\n\\begin{equation*}\n \\frac{\\Gamma(\\alpha_S)}{\\beta(\\alpha_S)} K(\\alpha_S)\n = \\dv{K(\\alpha_S)}{\\alpha_S}\n - K(\\alpha_S) \\frac{\\Gamma^{(1)}}{2 \\pi \\alpha_S b_0}\n\\end{equation*}\n'
# 10
# align environment listing a set of scale tuples over two alignment rows.
# Expected output: the \label line is not indented, the contents of
# `\{ ... \}` are spread over several indented lines with the closing `\}` on
# its own line, commas get a trailing space, `S/2` becomes `S / 2`, `2S`
# becomes `2 S`, and `\,\nonumber\\` becomes `\, \nonumber \\`.
s = r'''
\begin{align}
\label{eq:SCET_11}
(\mu_F,\mu_R,\mu_h) \in \{&(S,S,S),(2S,S,2S),(2S,S,S),(S/2,S,S/2),(S/2,S,S),(S,2 S,S),\,\nonumber\\
& (S,2 S,2 S),(S,S/2,S/2),(S,S/2,S),(2 S,2 S,2 S),(S/2,S/2,S/2)\} \, .
\end{align}
'''
assert TeXFormatter(s) == '\n\\begin{align}\n\\label{eq:SCET_11}\n (\\mu_F, \\mu_R, \\mu_h) \\in \\{\n &(S, S, S), (2 S, S, 2 S), (2 S, S, S), (S / 2, S, S / 2),\n (S / 2, S, S), (S, 2 S, S), \\, \\nonumber \\\\\n & (S, 2 S, 2 S), (S, S / 2, S / 2),\n (S, S / 2, S), (2 S, 2 S, 2 S), (S / 2, S / 2, S / 2)\n \\} \\, .\n\\end{align}\n'
# 13
# equation* holding one very long line with a \frac nested inside the
# numerator of another \frac, itself the argument of `\log(`.
# Expected output: the equation is split so that each top-level term starts a
# new line at its leading `=`, `+` or `-`; the `\log(` argument and the outer
# `\frac{` numerator are expanded over their own lines, with the closing `)`
# on a separate line.
s = r'''
\begin{equation*}
-b_0\alpha_S(\mu_0^2)\log(\frac{\mu^2}{\mu_0^2}) = \alpha_S(\mu_0^2)\frac{b_1}{b_0}\log(\frac{\frac{\alpha_S(\mu_0^2)}{\alpha_S(\mu^2)}+ \alpha_S(\mu_0^2)\frac{b_1}{b_0}}{1+\alpha_S(\mu_0^2) \frac{b_1}{b_0}})+1-\frac{\alpha_S(\mu_0^2)}{\alpha_S(\mu^2)} + \order{\alpha_S^2}
\end{equation*}
'''
assert TeXFormatter(s) == '\n\\begin{equation*}\n -b_0 \\alpha_S(\\mu_0^2) \\log(\\frac{\\mu^2}{\\mu_0^2})\n = \\alpha_S(\\mu_0^2) \\frac{b_1}{b_0} \\log(\n \\frac{\n \\frac{\\alpha_S(\\mu_0^2)}{\\alpha_S(\\mu^2)}\n + \\alpha_S(\\mu_0^2) \\frac{b_1}{b_0}\n }{1 + \\alpha_S(\\mu_0^2) \\frac{b_1}{b_0}}\n )\n + 1\n - \\frac{\\alpha_S(\\mu_0^2)}{\\alpha_S(\\mu^2)}\n + \\order{\\alpha_S^2}\n\\end{equation*}\n'
# 13
# Inline math that spans several source lines, followed by long prose.
# Expected output: the multi-line formula is opened with `$` at the end of a
# text line and closed with `$` on a line of its own, the line breaks that
# were inside the formula are kept, and the following prose is re-wrapped with
# each sentence ending its line. `~\cite{Catani:1996 yz}` is left untouched.
# The expected value is written with double quotes because it contains the
# `''` of the LaTeX closing quotation mark.
s = r'''
where $f_{i / h}(x, \muF^2)$ are moments of the parton
distribution functions and $ d \hat \si^{\rm
(res)}_{ij\tosv \tth}/ dQ^2 \left.
\right|_{\scriptscriptstyle({\rm NNLO})}$ represents the perturbative expansion of the resummed cross section truncated at NNLO. The inverse Mellin transform (\ref{invmel}) is evaluated numerically using according to the ``Minimal Prescription'' ~\cite{Catani:1996 yz}
along a contour ${\sf C}$ in the complex-$N$ space.
'''
assert TeXFormatter(s) == "\nwhere $f_{i / h}(x, \\muF^2)$ are moments of the parton\ndistribution functions and $\n d \\hat \\si^{\n \\rm\n(res)}_{ij \\tosv \\tth} / dQ^2 \\left.\n\\right|_{\\scriptscriptstyle({\\rm NNLO})}\n$\nrepresents the perturbative expansion\nof the resummed cross section truncated at NNLO.\nThe inverse Mellin transform (\\ref{invmel}) is evaluated numerically\nusing according to the ``Minimal Prescription'' ~\\cite{Catani:1996 yz}\nalong a contour ${\\sf C}$ in the complex-$N$ space.\n"
# 14
# One very long prose paragraph with several sentences, inline math and
# references. Unlike the neighbouring cases the input is a non-raw string, so
# every LaTeX backslash is written as `\\`.
# Expected output: the paragraph is split into many short lines, breaking at
# sentence ends and at commas/clause boundaries; the wording and the inline
# math are unchanged.
s = '''
However, the consequence is that the non-radiative amplitude in the r.h.s. of eq. \\eqref{eq:NLP} is evaluated using the momenta $p$, which are unphysical for this process, because $\\sum \\eta_i p_i \\neq 0$. This might seem problematic because an amplitude is intrinsically defined for physical momenta, and it is not uniquely defined for unphysical momenta. Therefore, the value of $\\mathcal{H}(p)$ is ambiguous, which translates into an ambiguity on $\\mathcal{A}(p, k)$ and thus seems to invalidate eq. \\eqref{eq:NLP}. The argument, however, is not entirely correct, as shown in \\cite{Balsach:2023ema}. Indeed, although an ambiguity is present, it only affects the NNLP terms.
'''
assert TeXFormatter(s) == '\nHowever,\nthe consequence is that the non-radiative amplitude in the r.h.s. of eq.\n\\eqref{eq:NLP} is evaluated using the momenta $p$,\nwhich are unphysical for this process, because $\\sum \\eta_i p_i \\neq 0$.\nThis might seem problematic because an amplitude is\nintrinsically defined for physical momenta,\nand it is not uniquely defined for unphysical momenta.\nTherefore, the value of $\\mathcal{H}(p)$ is ambiguous,\nwhich translates into an ambiguity on $\\mathcal{A}(p, k)$\nand thus seems to invalidate eq. \\eqref{eq:NLP}.\nThe argument, however, is not entirely correct,\nas shown in \\cite{Balsach:2023ema}.\nIndeed, although an ambiguity is present, it only affects the NNLP terms.\n'
# 15
# eqnarray whose body is already split term by term, with unbalanced
# `\left\{` / `\right.` pairs on either side of the `\nonumber \\` row break.
# The assertion expects the literal 'String not modified' instead of a
# formatted copy of the input.
# NOTE(review): presumably this is the sentinel TeXFormatter returns when it
# leaves the input unchanged -- confirm against the TeXFormatter definition.
s = r'''
\begin{eqnarray}
g_s (N)
= \frac{1}{2 \pi b_0} \left\{
\log(1 - 2 \lambda) + \alphas(\muR^2) \left[
\frac{b_1}{b_0} \frac{ \log(1 - 2 \lambda)}{ 1 - 2 \lambda}
- 2 \gamma_{\rm E} b_0 \frac{2 \lambda}{1 - 2 \lambda} \right.
\right.
\nonumber \\
\left. \left.
+ \, b_0 \log \left( \frac{Q^2}{\muR^2} \right)
\frac{2 \lambda}{1 - 2 \lambda}
\right]
\right\}
\end{eqnarray}
'''
assert TeXFormatter(s) == 'String not modified'
# 17
# figure environment with two \includegraphics and a long \caption that
# starts with a \label and contains inline math.
# Expected output: the \includegraphics and \caption lines are indented, the
# caption argument is opened with `{%` and closed with `%` followed by `}` on
# its own line, the caption text is re-wrapped (keeping the original line
# breaks), and `+` inside `$...$` gets surrounding spaces
# (`NNLO+NNLL+EW` -> `NNLO + NNLL + EW`).
s = r'''
\begin{figure}[ht!]
\includegraphics[width=0.49\textwidth]{tth_figxsec.pdf}
\includegraphics[width=0.49\textwidth]{tth_figerr.pdf}
\caption{\label{fig:xsecerr}{\bf Left panel:} the total cross section for $t\bar t H$, $\sigma_{\rm NNLO+NNLL+EW}$, and the relative impact of the different
contributions with respect to $\sigma_{\rm NLO}$. {\bf Right panel:} scale uncertainties computed for the cross section computed at different
accuracies. Solid lines display the total width of the scale-uncertainty band, while dashed line the maximum variation with respect to the central prediction.}
\end{figure}
'''
assert TeXFormatter(s) == '\n\\begin{figure}[ht!]\n \\includegraphics[width=0.49\\textwidth]{tth_figxsec.pdf}\n \\includegraphics[width=0.49\\textwidth]{tth_figerr.pdf}\n \\caption{%\n \\label{fig:xsecerr}{\\bf Left panel:} the total cross section for\n $t \\bar t H$, $\\sigma_{\\rm NNLO + NNLL + EW}$,\n and the relative impact of the different\n contributions with respect to $\\sigma_{\\rm NLO}$.\n {\\bf Right panel:} scale uncertainties\n computed for the cross section computed at different\n accuracies.\n Solid lines display the total width of the scale-uncertainty band,\n while dashed line the maximum\n variation with respect to the central prediction.%\n}\n\\end{figure}\n'
#18
# figure environment framed by `%%%%` LaTeX comment lines, with the \label
# placed after the \caption and a caption that ends in ` }`.
# Expected output: the `%%%%` lines are kept verbatim, \centering, \caption
# and \label are indented while the \includegraphics lines are not, the
# caption is opened with `{%` and re-wrapped, its closing ` }` stays at the
# end of the last text line, and `NNLO+NNLL` in plain text (outside math) is
# left without spaces.
s = r'''
%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}
\centering
\includegraphics[width=.49\textwidth]{tth_nnll_comp_qcd_scet.pdf}
\includegraphics[width=.49\textwidth]{tth_nnllnnlo_lhc136_asym.pdf}
\caption{Left: comparison between NNLO+NNLL results in dQCD and SCET for three parametrically different choices of the default
scales. Right: comparison of the combined NNLO+NNLL results with NNLO, for the same three sets of scales. No EW corrections are included. See the text for
additional explanations on the estimation of the uncertainties. }
\label{fig:tth_comparisons}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%
'''
assert TeXFormatter(s) == '\n%%%%%%%%%%%%%%%%%%%%%%%%%\n\\begin{figure}\n \\centering\n\\includegraphics[width=.49\\textwidth]{tth_nnll_comp_qcd_scet.pdf}\n\\includegraphics[width=.49\\textwidth]{tth_nnllnnlo_lhc136_asym.pdf}\n \\caption{%\n Left: comparison between NNLO+NNLL results in dQCD\n and SCET for three parametrically different choices of the default\n scales.\n Right: comparison of the combined NNLO+NNLL results with NNLO,\n for the same three sets of scales.\n No EW corrections are included.\n See the text for\n additional explanations on the estimation of the uncertainties. }\n \\label{fig:tth_comparisons}\n\\end{figure}\n%%%%%%%%%%%%%%%%%%%%%%%%%%\n'