forked from aploium/zmirror
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMagicWebsiteMirror.py
1754 lines (1468 loc) · 72.3 KB
/
MagicWebsiteMirror.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
# coding=utf-8
# NOTE(review): indentation of this file was destroyed by a copy/paste; the
# block structure below is reconstructed from the statement syntax.
import os

# run relative to the script's own folder, so config/cache paths resolve
if os.path.dirname(__file__) != '':
    os.chdir(os.path.dirname(__file__))
import traceback
import pickle
from datetime import datetime, timedelta
import re
import base64
import zlib
from time import time
from fnmatch import fnmatch
from html import escape as html_escape
import threading
from urllib.parse import urljoin, urlsplit, urlunsplit, quote_plus
import requests
from flask import Flask, request, make_response, Response, redirect
from ColorfulPyPrint import *  # TODO: Migrate logging tools to the stdlib

# optional accelerated charset detector
try:
    from cchardet import detect as c_chardet
except:
    cchardet_available = False
else:
    cchardet_available = True
# optional accelerated lru_cache, falls back to the stdlib implementation
try:
    from fastcache import lru_cache

    infoprint('lru_cache loaded from fastcache')
except:
    from functools import lru_cache

    warnprint('package fastcache not found, fallback to stdlib lru_cache. '
              'Considering install it using "pip3 install fastcache"')
# default config must load first so config.py can override it
try:
    from config_default import *
except:
    warnprint('the config_default.py is missing, this program may not works normally\n'
              'config_default.py 文件丢失, 这会导致配置文件不向后兼容, 请重新下载一份 config_default.py')
try:
    from config import *
except:
    warnprint(
        'the config_default.py is missing, fallback to default configs(if we can), '
        'please COPY the config_default.py to config.py, and change it\'s content, '
        'or use the configs in the more_config_examples folder\n'
        '自定义配置文件 config.py 丢失, 将使用默认设置, 请将 config_default.py 复制一份为 config.py, '
        '并根据自己的需求修改里面的设置'
        '(或者使用 more_config_examples 中的配置文件)'
    )
# Optional local file cache; automatically disabled when it cannot be created.
if local_cache_enable:
    try:
        from cache_system import FileCache, get_expire_from_mime

        cache = FileCache(max_size_kb=8192)
    except Exception as e:
        errprint('Can Not Create Local File Cache: ', e, ' local file cache is disabled automatically.')
        local_cache_enable = False

__VERSION__ = '0.20.9-dev'
__author__ = 'Aploium <[email protected]>'

# ########## Basic Init #############
ColorfulPyPrint_set_verbose_level(verbose_level)
my_host_name_no_port = my_host_name  # host name WITHOUT the port part
if my_host_port is not None:
    my_host_name += ':' + str(my_host_port)  # from now on my_host_name includes the port
    my_host_name_urlencoded = quote_plus(my_host_name)
else:
    my_host_name_urlencoded = my_host_name

static_file_extensions_list = set(static_file_extensions_list)
external_domains_set = set(external_domains or [])
# all domains this mirror is allowed to proxy (target + externals + aliases)
allowed_domains_set = external_domains_set.copy()
allowed_domains_set.add(target_domain)

# domains that are treated as the target domain itself
domain_alias_to_target_set = set()
domain_alias_to_target_set.add(target_domain)
domains_alias_to_target_domain = list(domains_alias_to_target_domain)
if domains_alias_to_target_domain:
    for _domain in domains_alias_to_target_domain:
        allowed_domains_set.add(_domain)
        domain_alias_to_target_set.add(_domain)
    domains_alias_to_target_domain.append(target_domain)
else:
    domains_alias_to_target_domain = [target_domain]

my_host_scheme_escaped = my_host_scheme.replace('/', r'\/')
myurl_prefix = my_host_scheme + my_host_name  # eg: https://mirror.example.com
myurl_prefix_escaped = myurl_prefix.replace('/', r'\/')
cdn_domains_number = len(CDN_domains)
# response headers we copy through from the remote server
allowed_remote_response_headers = {
    'content-type', 'date', 'expires', 'cache-control', 'last-modified', 'server', 'location',
    'accept-ranges',
    'access-control-allow-origin', 'access-control-allow-headers', 'access-control-allow-methods',
    'access-control-expose-headers', 'access-control-max-age', 'access-control-allow-credentials',
    'timing-allow-origin',
}
allowed_remote_response_headers.update(custom_allowed_remote_headers)

# ## Get Target Domain and MyHostName's Root Domain ##
temp = target_domain.split('.')
if len(temp) <= 2 or len(temp) == 3 and temp[1] in ('com', 'net', 'org', 'co', 'edu', 'mil', 'gov', 'ac'):
    target_domain_root = target_domain
else:
    target_domain_root = '.'.join(temp[1:])
temp = my_host_name.split('.')
if len(temp) <= 2 or len(temp) == 3 and temp[1] in ('com', 'net', 'org', 'co', 'edu', 'mil', 'gov', 'ac'):
    # FIX: was `target_domain` (copy-paste error) -- this branch computes the
    # root of OUR host name, not of the target domain.
    my_host_name_root = my_host_name
else:
    my_host_name_root = '.'.join(temp[1:])

# ## thread local var ##
# per-request scratch state, reset by the request handler
request_local = threading.local()
request_local.start_time = None
request_local.cur_mime = ''
request_local.cache_control = ''
request_local.temporary_domain_alias = None

# ########## Handle dependencies #############
# Normalize mutually-dependent config switches so later code can trust them.
if not enable_static_resource_CDN:
    mime_based_static_resource_CDN = False
    disable_legacy_file_recognize_method = True
if not mime_based_static_resource_CDN:
    cdn_redirect_code_if_cannot_hard_rewrite = 0
# record incoming urls if we should use cdn on it
url_to_use_cdn = {}
if not cdn_redirect_code_if_cannot_hard_rewrite:
    cdn_redirect_encode_query_str_into_url = False
if not local_cache_enable:
    cdn_redirect_encode_query_str_into_url = False
if not isinstance(target_static_domains, set):
    target_static_domains = set()
if not enable_stream_content_transfer:
    steamed_mime_keywords = ()
if not url_custom_redirect_enable:
    url_custom_redirect_list = {}
    url_custom_redirect_regex = ()
    shadow_url_redirect_regex = ()
    plain_replace_domain_alias = ()
if not enable_automatic_domains_whitelist:
    domains_whitelist_auto_add_glob_list = tuple()
if not enable_individual_sites_isolation:
    isolated_domains = set()
else:
    for isolated_domain in isolated_domains:
        if isolated_domain not in external_domains_set:
            warnprint('An isolated domain:', isolated_domain,
                      'would not have effect because it did not appears in the `external_domains` list')
if enable_custom_access_cookie_generate_and_verify:
    human_ip_verification_whitelist_from_cookies = False
if not is_use_proxy:
    requests_proxies = None
if human_ip_verification_enabled:
    import ipaddress

    # pre-parse whitelist networks once
    buff = []
    for network in human_ip_verification_default_whitelist_networks:
        buff.append(ipaddress.ip_network(network, strict=False))
    human_ip_verification_default_whitelist_networks = tuple(buff)
    # the answers hash salt includes every configured answer
    for question in human_ip_verification_questions:
        human_ip_verification_answers_hash_str += question[1]
else:
    identity_verify_required = False
    human_ip_verification_whitelist_from_cookies = False
    must_verify_cookies = False
if not human_ip_verification_whitelist_from_cookies and not enable_custom_access_cookie_generate_and_verify:
    must_verify_cookies = False

url_rewrite_cache = {}  # an VERY Stupid and VERY Experimental Cache
url_rewrite_cache_hit_count = 0
url_rewrite_cache_miss_count = 0
# ########### PreCompile Regex ###############
# Advanced url rewriter, see function response_text_rewrite()
# This regex is the heart of the whole program: it extracts url-like substrings
# from html/css/js text.  Read it with regex syntax-highlighting in an IDE.
# A match is NOT guaranteed to be a real url; regex_url_reassemble() performs
# further validation on every match.
regex_adv_url_rewriter = re.compile(  # TODO: Add non-standard port support
    # prefix, required: 'action='(form) 'href='(link) 'src=' 'url('(css) '@import'(css) '":'(js/json "key":"value")
    r"""(?P<prefix>\b((action|href|src)\s*=|url\s*\(|@import\s*|"\s*:)\s*)""" +  # prefix, eg: src=
    # left quote, optional (url() may be unquoted); except for url()/@import a quote
    # is required and must match the right one -- checked in the rewrite function
    r"""(?P<quote_left>["'])?""" +  # quote "'
    # domain and scheme, optional: http:// https:// // and JSON-escaped http:\/\/ https:\/\/ \/\/
    r"""(?P<domain_and_scheme>(?P<scheme>(https?:)?\\?/\\?/)(?P<domain>([-a-z0-9]+\.)+[a-z]+))?""" +  # domain and scheme
    # url path, query string included, optional
    r"""(?P<path>[^\s;+$?#'"\{}]*?""" +  # full path(with query string) /foo/bar.js?love=luciaZ
    # file extension in the url; only compiled in when the legacy
    # extension-based static-file recognizer is enabled
    (r"""(\.(?P<ext>[-_a-z0-9]+?))?""" if not disable_legacy_file_recognize_method else '') +  # file ext
    # query string, optional
    r"""(?P<query_string>\?[^\s?#'"]*?)?)""" +  # query string ?love=luciaZ
    # right quote (may be a closing paren), required
    r"""(?P<quote_right>["'\)])(?P<right_suffix>\W)""",  # right quote "'
    flags=re.IGNORECASE
)
regex_extract_base64_from_embedded_url = re.compile(
    r'_mwm0(?P<gzip>z?)_\.(?P<b64>[a-zA-Z0-9-_]+=*)\._mwm1_\.[a-zA-Z\d]+\b')
# Response Cookies Rewriter, see response_cookie_rewrite()
regex_cookie_rewriter = re.compile(r'\bdomain=(\.?([\w-]+\.)+\w+)\b', flags=re.IGNORECASE)
# Request Domains Rewriter, see client_requests_text_rewrite()
if my_host_port is not None:
    # match the mirror host both with and without the explicit port
    temp = r'(' + re.escape(my_host_name) + r'|' + re.escape(my_host_name_no_port) + r')'
else:
    temp = re.escape(my_host_name)
regex_request_rewriter = re.compile(
    temp + r'(/|(%2F))extdomains(/|(%2F))(https-)?(?P<origin_domain>\.?([\w-]+\.)+\w+)\b',
    flags=re.IGNORECASE)

# Flask main app
app = Flask(__name__)
# ###################### Functional Tests ####################### #
# 0. test environment
# 0.0 global search keyword: lovelive ,scholar keyword: gravity
# 0.1 Firefox/46.0 Windows/10 x64
#
# 1. www.google.com load [OK]
# 1.0 basic [OK]
# 1.1 search hint [OK]
#
# 2. webpage search [OK]
# 2.0 basic [OK]
# 2.1 search result page 2,3 [OK]
# 2.2 search tools [OK]
# 2.3 result item click [OK]
# 2.3.0 basic [OK]
# 2.3.1 result item (left) click, with redirect [OK]
# 2.3.2 result item (right) click, with top banner [OK]
# 2.4 search item cache [Not Supported Yet]
#
# 3. image search [OK]
# 3.0 basic [OK]
# 3.1 all images lazy load [OK]
# 3.2 image detail banner [OK]
# 3.2.0 basic [OK]
# 3.2.1 HD lazy load [OK]
# 3.2.2 relative images show [OK]
# 3.2.3 relative images click/HD lazy load [OK]
# 3.2.4 view image page [OK]
# 3.2.5 view raw image (ps: raw image may be blocked by GFW, thus NOT accessible) [OK]
# 3.3 scroll down lazy load [OK]
#
# 5. google scholar (/scholar)
# 5.0 basic [OK]
# 5.1 search (gravity) [OK]
# 5.1.0 basic [OK]
# 5.1.1 result item click and redirect [OK]
# 5.1.2 citations click [OK]
# 5.1.3 search filters ("Since year 2015") [OK]
#
# 6. video search (ps: DO NOT support youtube) [OK]
# 6.0 basic [OK]
# 6.1 video thumb show [OK]
# 6.2 result item click redirect [OK]
# 6.3 page 2,3 [OK]
#
# ########## Begin Utils #############
def calc_domain_replace_prefix(_domain):
    """Precompute every textual variant of *_domain* used by the rewriters.

    Returns a dict mapping a variant name to the rendered string: plain
    scheme prefixes, quoted forms, JSON slash-escaped forms, urlencoded
    forms, and the escaped+urlencoded combinations.
    """
    def esc(text):
        # JSON-style slash escaping: "//" -> "\/\/"
        return text.replace('/', r'\/')

    slash = '//' + _domain
    http = 'http://' + _domain
    https = 'https://' + _domain
    double_quoted = '"%s"' % _domain
    single_quoted = "'%s'" % _domain
    return dict(
        # normal
        slash=slash,
        http=http,
        https=https,
        double_quoted=double_quoted,
        single_quoted=single_quoted,
        # escape slash
        slash_esc=esc(slash),
        http_esc=esc(http),
        https_esc=esc(https),
        # urlencoded
        slash_ue=quote_plus(slash),
        http_ue=quote_plus(http),
        https_ue=quote_plus(https),
        double_quoted_ue=quote_plus(double_quoted),
        single_quoted_ue=quote_plus(single_quoted),
        # escaped and urlencoded
        slash_esc_ue=quote_plus(esc(slash)),
        http_esc_ue=quote_plus(esc(http)),
        https_esc_ue=quote_plus(esc(https)),
    )
def add_temporary_domain_alias(source_domain, target_domain):
    """Register a (source, target) domain alias valid for the current request only.

    The alias list lives in ``request_local`` and is stored as a tuple.
    """
    current = request_local.temporary_domain_alias
    aliases = [] if current is None else list(current)
    aliases.append((source_domain, target_domain))
    request_local.temporary_domain_alias = tuple(aliases)
    dbgprint('A domain', source_domain, 'to', target_domain, 'added to temporary_domain_alias',
             request_local.temporary_domain_alias)
@lru_cache(maxsize=8192)
def is_domain_match_glob_whitelist(domain):
    """True when *domain* matches any glob pattern of the auto-add whitelist."""
    return any(fnmatch(domain, pattern)
               for pattern in domains_whitelist_auto_add_glob_list)
@lru_cache(maxsize=128)
def is_content_type_streamed(content_type):
    """True when the content-type contains any keyword configured for streamed transfer."""
    return any(keyword in content_type for keyword in steamed_mime_keywords)
def try_match_and_add_domain_to_rewrite_white_list(domain, force_add=False):
    """Add *domain* to the rewrite whitelist if it matches the auto-whitelist globs.

    :param domain: candidate domain (may be None/empty)
    :param force_add: add unconditionally, skipping the glob match
    :return: True when the domain is (now) whitelisted, False otherwise
    """
    global external_domains, external_domains_set, allowed_domains_set, prefix_buff
    if not domain:  # covers both None and ''
        return False
    if domain in allowed_domains_set:
        return True
    if not force_add and not is_domain_match_glob_whitelist(domain):
        return False

    infoprint('A domain:', domain, 'was added to whitelist')
    external_domains = tuple(list(external_domains) + [domain])
    external_domains_set.add(domain)
    allowed_domains_set.add(domain)
    prefix_buff[domain] = calc_domain_replace_prefix(domain)
    # persist the decision so it survives restarts (best effort)
    try:
        with open('automatic_domains_whitelist.log', 'a', encoding='utf-8') as fp:
            fp.write(domain + '\n')
    except:
        traceback.print_exc()
    return True
def current_line_number():
    """Return the source line number of the caller."""
    import inspect
    caller_frame = inspect.currentframe().f_back
    return caller_frame.f_lineno
@lru_cache(maxsize=8192)
def extract_real_url_from_embedded_url(embedded_url):
    """
    eg: https://cdn.domain.com/a.php_mwm0_.cT1zb21ldGhpbmc=._mwm1_.css
    ---> https://foo.com/a.php?q=something (assume it returns an css) (base64 only)
    eg2: https://cdn.domain.com/a/b/_mwm0_.bG92ZT1saXZl._mwm1_.jpg
    ---> https://foo.com/a/b/?love=live (assume it returns an jpg) (base64 only)
    eg3: https://cdn.domain.com/a/b/_mwm0z_.[some long long base64 encoded string]._mwm1_.jpg
    ---> https://foo.com/a/b/?love=live[and a long long query string] (assume it returns an jpg) (gzip + base64)
    eg4:https://cdn.domain.com/a (no change)
    ---> (no query string): https://foo.com/a (assume it returns an png) (no change)
    :param embedded_url: embedded_url
    :return: real url or None
    """
    # fast rejection: the url mark must appear near the end
    if '._mwm1_.' not in embedded_url[-15:]:
        return None
    m = regex_extract_base64_from_embedded_url.search(embedded_url)
    b64 = get_group('b64', m)
    if not b64:
        return None
    # everything before the mark is the real path, eg:
    # 'https://cdn.domain.com/a.php_mwm0_...' -> 'https://cdn.domain.com/a.php'
    real_request_url_no_query = embedded_url[:m.span()[0]]
    try:
        raw_query = base64.urlsafe_b64decode(b64)
        if get_group('gzip', m):  # the 'z' flag marks a zlib-compressed payload
            raw_query = zlib.decompress(raw_query)
        query_string = raw_query.decode(encoding='utf-8')
    except:
        traceback.print_exc()
        return None
    return urljoin(real_request_url_no_query, '?' + query_string)
@lru_cache(maxsize=4096)
def embed_real_url_to_embedded_url(real_url_raw, url_mime, escape_slash=False):
    """Encode a url's query string into its path (inverse of extract_real_url_from_embedded_url).

    The query string is (optionally zlib-compressed and) urlsafe-base64 embedded
    between the '_mwm0[z]_.' and '._mwm1_.' marks, followed by a fake extension
    chosen from the mime, so CDNs treat the url as a plain static file.
    """
    real_url = real_url_raw.replace(r'\/', '/') if escape_slash else real_url_raw
    url_sp = urlsplit(real_url)
    if not url_sp.query:  # nothing to embed
        return real_url_raw
    try:
        byte_query = url_sp.query.encode()
        gzip_label = ''
        if len(byte_query) > 128:  # only compress long query strings
            gzip_label = 'z'
            byte_query = zlib.compress(byte_query)
        b64_query = base64.urlsafe_b64encode(byte_query).decode()
        mixed_path = (url_sp.path + '_mwm0' + gzip_label + '_.' + b64_query
                      + '._mwm1_.' + mime_to_use_cdn[url_mime])
        result = urlunsplit((url_sp.scheme, url_sp.netloc, mixed_path, '', ''))
    except:
        traceback.print_exc()
        return real_url_raw
    if escape_slash:
        result = result.replace('/', r'\/')
    return result
def extract_from_url_may_have_extdomains(extdomains_url=None):
    """[http://foo.bar]/extdomains/foobar.com/path --> ('foobar.com', False, '/path'), JSON supported.

    :param extdomains_url: url/path to parse; defaults to the current request path
    :return: (real_domain, is_https, real_path)
    """
    if extdomains_url is None:
        extdomains_url = request.path
    extdomains_pos = extdomains_url.find('extdomains')
    if extdomains_pos == -1:
        # no extdomains prefix: the url belongs to the main target domain
        return target_domain, target_scheme == 'https://', extdomains_url

    # 10 == len('extdomains')
    if extdomains_url[extdomains_pos + 10] == '\\':
        # JSON-escaped form: extdomains\/domain\/path
        domain_end_pos = extdomains_url.find('\\', extdomains_pos + 12)
        real_domain = extdomains_url[extdomains_pos + 12:domain_end_pos]
        # FIX: the result of .replace() was previously discarded (str.replace
        # returns a new string) -- unescape '\.' sequences inside the domain.
        real_domain = real_domain.replace('\\.', '.')
    else:
        domain_end_pos = extdomains_url.find('/', extdomains_pos + 11)
        real_domain = extdomains_url[extdomains_pos + 11:domain_end_pos]
    remote_path = extdomains_url[domain_end_pos:]
    if real_domain[:6] == 'https-':
        # 'https-' prefix marks a domain that must be fetched over https
        real_domain = real_domain[6:]
        is_https = True
    else:
        is_https = False
    return real_domain, is_https, remote_path
def get_ext_domain_inurl_scheme_prefix(ext_domain):
    """Return 'https-' when *ext_domain* must be fetched via https, else ''.

    Driven by the ``force_https_domains`` config: 'NONE', 'ALL', or a container.
    """
    if force_https_domains == 'NONE':
        return ''
    if force_https_domains == 'ALL' or ext_domain in force_https_domains:
        return 'https-'
    return ''
def add_ssrf_allowed_domain(domain):
    """Whitelist *domain* for proxying (SSRF protection bypass for this domain)."""
    global allowed_domains_set
    allowed_domains_set.add(domain)
def set_request_for_debug(dummy_request):
    """Replace the module-level flask `request` proxy with a stub (tests/debug only)."""
    global request
    request = dummy_request
def strx(*args, sep=' '):
    """Stringify every argument and join them with *sep*.

    :param args: objects to stringify
    :param sep: separator placed between items (default: single space)
    :return: joined string without a trailing separator
    """
    output = ''
    for arg in args:
        output += str(arg) + sep
    # FIX: str.rstrip returns a new string; the previous code discarded the
    # result, leaking a trailing separator into every caller.
    output = output.rstrip(sep)
    return output
@lru_cache(maxsize=1024)
def check_global_ua_pass(ua_str):
    """True when the (case-insensitive) user-agent contains the global pass keyword."""
    if ua_str is None:
        return False
    return global_ua_white_name in ua_str.lower()
@lru_cache(maxsize=128)
def is_mime_represents_text(input_mime):
    """
    Determine whether an mime is text (eg: text/html: True, image/png: False)
    :param input_mime: str
    :return: bool
    """
    lowered = input_mime.lower()
    return any(keyword in lowered for keyword in text_like_mime_keywords)
@lru_cache(maxsize=128)
def extract_mime_from_content_type(content_type):
    """Strip parameters from a Content-Type value ('text/html; charset=x' -> 'text/html')."""
    mime, _sep, _params = content_type.partition(';')
    return mime
@lru_cache(maxsize=128)
def is_content_type_using_cdn(content_type):
    """Return the bare mime (truthy) when it is CDN-able, otherwise False."""
    mime = extract_mime_from_content_type(content_type)
    return mime if mime in mime_to_use_cdn else False
@lru_cache(maxsize=256)
def is_ua_in_whitelist(ua_str):
    """True when the user-agent carries the global pass keyword or a whitelisted spider token.

    :type ua_str: str
    """
    lowered = ua_str.lower()
    if global_ua_white_name in lowered:
        return True
    return any(allowed in lowered for allowed in spider_ua_white_list)
def generate_simple_resp_page(errormsg=b'We Got An Unknown Error', error_code=500):
    """Build a bare flask response carrying *errormsg* with the given HTTP status."""
    return make_response(errormsg, error_code)
def generate_html_redirect_page(target_url, msg='', delay_sec=1):
    """Build an HTML page that redirects to *target_url* after *delay_sec* seconds.

    Uses both a meta-refresh and a JS fallback; all dynamic values are
    HTML-escaped before being interpolated into the template.
    """
    safe_url = html_escape(target_url)
    template = r"""<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>重定向 (Page Redirect)</title>
<meta http-equiv="refresh" content="%d; url=%s">
<script>setTimeout(function(){location.href="%s"} , %d000);</script>
</head>
<body>
<pre>%s</pre>
<hr />
You are now redirecting to <a href="%s">%s</a>, if it didn't redirect automatically, please click that link.
</body>
</html>"""
    resp_content = template % (
        delay_sec, safe_url, safe_url, delay_sec + 1,
        html_escape(msg), safe_url, safe_url
    )
    return Response(response=resp_content.encode('utf-8'))
@lru_cache(maxsize=32)
def generate_304_response(content_type=None):
    """Build (and cache) an empty 304 Not-Modified response, tagged for debugging."""
    r = Response(content_type=content_type, status=304)
    r.headers.add('X-Cache', 'FileHit-304')
    return r
def generate_ip_verify_hash(input_dict):
    """Derive the 2-part verification token from the user's answers.

    The first 8-ish hex chars hash the concatenated answers (salted with the
    configured answers string); the tail hashes that hash again with the same
    salt, so verify_ip_hash_cookie() can check integrity without the answers.
    """
    strbuff = human_ip_verification_answers_hash_str
    for key, value in input_dict.items():
        strbuff += key + value
    input_key_hash = hex(zlib.adler32(strbuff.encode(encoding='utf-8')))[2:]
    output_hash = hex(zlib.adler32(
        (input_key_hash + human_ip_verification_answers_hash_str).encode(encoding='utf-8')
    ))[2:]
    return input_key_hash + output_hash
@lru_cache(maxsize=2048)
def verify_ip_hash_cookie(hash_cookie_value):
    """Check a token produced by generate_ip_verify_hash().

    :type hash_cookie_value: str
    :return: True when the trailing hash matches the recomputed one
    """
    try:
        input_key_hash = hash_cookie_value[:8]
        expected = hash_cookie_value[8:]
        calculated = hex(zlib.adler32(
            (input_key_hash + human_ip_verification_answers_hash_str).encode(encoding='utf-8')
        ))[2:]
        return expected == calculated
    except:  # malformed input of any kind -> not verified
        return False
def put_response_to_local_cache(url, our_resp, req, remote_resp):
    """
    put our response object(headers included) to local cache
    :param url: client request url
    :param our_resp: our response(flask response object) to client, would be storged
    :param req: the flask request object
    :param remote_resp: the requests request object (the one returned by send_request() )
    :return: None
    """
    # Only cache GET requests that the remote answered with 200(OK)
    if not (local_cache_enable and req.method == 'GET' and remote_resp.status_code == 200):
        return
    # header name casing differs between flask/apache(win)/apache(linux)
    content_type = remote_resp.headers.get('content-type', '') or remote_resp.headers.get('Content-Type', '')
    last_modified = remote_resp.headers.get('last-modified', None) or remote_resp.headers.get('Last-Modified', None)
    cache.put_obj(
        url,
        our_resp,
        expires=get_expire_from_mime(extract_mime_from_content_type(content_type)),
        obj_size=len(remote_resp.content),
        last_modified=last_modified,
        info_dict={'content-type': content_type,  # storge extra info for future use
                   'last-modified': last_modified
                   },
    )
def try_get_cached_response(url, client_header):
    """
    :param url: real url with query string
    :type client_header: dict
    :return: cached flask response, a 304 response, or None on cache miss
    """
    # cache is only consulted for GET requests
    if not (local_cache_enable and request.method == 'GET' and cache.is_cached(url)):
        return None
    if 'if-modified-since' in client_header and \
            cache.is_unchanged(url, client_header.get('if-modified-since', None)):
        # client already holds an up-to-date copy
        dbgprint('FileCacheHit-304', cache.get_info(url), url)
        return generate_304_response()
    dbgprint('FileCacheHit-200')
    resp = cache.get_obj(url)
    assert isinstance(resp, Response)
    resp.headers.add('X-Cache', 'FileHit')
    return resp
def get_group(name, match_obj):
    """Return regex match group *name*, or '' when unavailable.

    '' is returned when the group did not participate in the match, when no
    group of that name exists, or when *match_obj* itself is None.
    """
    try:
        obj = match_obj.group(name)
    # FIX: narrowed from a bare `except:` -- AttributeError covers
    # match_obj=None, IndexError covers an unknown group name.
    except (AttributeError, IndexError):
        return ''
    # an optional group that did not match yields None
    return obj if obj is not None else ''
def regex_url_reassemble(match_obj):
    """
    Reassemble url parts split by the regex.
    :param match_obj: match object of stdlib re
    :return: re assembled url string (included prefix(url= etc..) and suffix.)
    """
    # Read Cache: identical raw matches always rewrite to the same result
    if match_obj.group() in url_rewrite_cache:
        global url_rewrite_cache_hit_count
        url_rewrite_cache_hit_count += 1
        return url_rewrite_cache[match_obj.group()]
    else:
        global url_rewrite_cache_miss_count
        prefix = get_group('prefix', match_obj)
        quote_left = get_group('quote_left', match_obj)
        quote_right = get_group('quote_right', match_obj)
        path = get_group('path', match_obj)
        match_domain = get_group('domain', match_obj)
        scheme = get_group('scheme', match_obj)
        whole_match_string = match_obj.group()
        # JSON-escaped slashes mean the final result must be re-escaped too
        if r"\/" in path or r"\/" in scheme:
            require_slash_escape = True
            path = path.replace(r"\/", "/")
        else:
            require_slash_escape = False
        # Reject matches that cannot be a rewritable url:
        if (not path  # path is blank
            # only url(something) and @import are allowed to be unquoted
            or ('url' not in prefix and 'import' not in prefix) and (not quote_left or quote_right == ')')
            # for "key":"value" type replace, we must have at least one '/' in url path (for the value to be regard as url)
            or (':' in prefix and '/' not in path)
            # if we have quote_left, it must equals to the right
            or (quote_left and quote_left != quote_right)
            # in javascript, we only rewrite those with explicit scheme ones.
            or (('javascript' in request_local.cur_mime) and not scheme)
            ):
            return whole_match_string
        # v0.19.0+ Automatic Domains Whitelist (Experimental)
        if enable_automatic_domains_whitelist:
            try_match_and_add_domain_to_rewrite_white_list(match_domain)
        # domain/path of the page currently being rewritten
        remote_domain, _is_remote_https, remote_path = extract_from_url_may_have_extdomains()
        # a relative url (no domain in the match) belongs to the current page's domain
        domain = match_domain or remote_domain
        # skip if the domain are not in our proxy list
        if domain not in allowed_domains_set:
            return match_obj.group()  # return raw, do not change
        # this resource's absolute url path to the domain root.
        path = urljoin(remote_path, path)
        url_no_scheme = urljoin(domain + '/', path.lstrip('/'))
        # add extdomains prefix in path if need
        if domain in external_domains_set:
            scheme_prefix = get_ext_domain_inurl_scheme_prefix(domain)
            path = '/extdomains/' + scheme_prefix + url_no_scheme
        # consult the table of urls whose mime we have already observed
        if mime_based_static_resource_CDN and url_no_scheme in url_to_use_cdn:
            _we_knew_this_url = True
            _this_url_mime_cdn = url_to_use_cdn[url_no_scheme][0]
        else:
            _we_knew_this_url = False
            _this_url_mime_cdn = False
        # Apply CDN domain
        if _this_url_mime_cdn \
            or (not disable_legacy_file_recognize_method and get_group('ext', match_obj) in static_file_extensions_list):
            # pick an cdn domain due to the length of url path
            # an advantage of choose like this (not randomly), is this can make higher CDN cache hit rate.
            # CDN rewrite, rewrite static resources to cdn domains.
            # A lot of cases included, the followings are just the most typical examples.
            # http(s)://target.com/img/love_lucia.jpg --> http(s)://your.cdn.domains.com/img/love_lucia.jpg
            # http://external.com/css/main.css --> http(s)://your.cdn.domains.com/extdomains/external.com/css/main.css
            # https://external.pw/css/main.css --> http(s)://your.cdn.domains.com/extdomains/https-external.pw/css/main.css
            replace_to_scheme_domain = my_host_scheme + CDN_domains[zlib.adler32(path.encode()) % cdn_domains_number]
        elif not scheme:
            # keep scheme-less matches scheme-less (eg in javascript)
            replace_to_scheme_domain = ''
        else:
            replace_to_scheme_domain = myurl_prefix
        reassembled_url = urljoin(replace_to_scheme_domain, path)
        # optionally fold the query string into the path for CDN cacheability
        if _this_url_mime_cdn and cdn_redirect_encode_query_str_into_url:
            reassembled_url = embed_real_url_to_embedded_url(
                reassembled_url,
                url_mime=url_to_use_cdn[url_no_scheme][1],
                escape_slash=require_slash_escape
            )
        if require_slash_escape:
            reassembled_url = reassembled_url.replace("/", r"\/")
        # reassemble!
        # prefix: src= quote_left: "
        # path: /extdomains/target.com/foo/bar.js?love=luciaZ
        reassembled = prefix + quote_left + reassembled_url + quote_right + get_group('right_suffix', match_obj)
        # write the adv rewrite cache only if we disable CDN or we known whether this url is CDN-able
        if not mime_based_static_resource_CDN or _we_knew_this_url:
            url_rewrite_cache[match_obj.group()] = reassembled  # write cache
            url_rewrite_cache_miss_count += 1
        return reassembled
@lru_cache(maxsize=256)
def is_denied_because_of_spider(ua_str):
    """True when the user-agent looks like a crawler that is NOT whitelisted."""
    ua_str = ua_str.lower()
    if 'spider' not in ua_str and 'bot' not in ua_str:
        return False  # not a crawler at all
    if is_ua_in_whitelist(ua_str):
        dbgprint('A Spider/Bot\'s access was granted', ua_str)
        return False
    dbgprint('A Spider/Bot was denied, UA is:', ua_str)
    return True
def load_ip_whitelist_file():
    """Load the per-IP whitelist file (one IP per line) into a set.

    :return: set of whitelisted IP strings (empty when the file is absent)
    """
    set_buff = set()
    if os.path.exists(human_ip_verification_whitelist_file_path):
        with open(human_ip_verification_whitelist_file_path, 'r', encoding='utf-8') as fp:
            # FIX: the previous code called fp.readline() exactly once, so only
            # the FIRST whitelisted IP survived a restart even though
            # append_ip_whitelist_file() writes one IP per line.
            for line in fp:
                ip = line.strip()
                if ip:  # skip blank lines
                    set_buff.add(ip)
    return set_buff
def append_ip_whitelist_file(ip_to_allow):
    """Append one verified IP (newline-terminated) to the whitelist file, best effort."""
    try:
        with open(human_ip_verification_whitelist_file_path, 'a', encoding='utf-8') as fp:
            fp.write(ip_to_allow + '\n')
    except:
        # a write failure only costs persistence, not the in-memory whitelist
        errprint('Unable to write whitelist file')
        traceback.print_exc()
def ip_whitelist_add(ip_to_allow, info_record_dict=None):
    """Whitelist a client IP that passed human verification and log the event."""
    if ip_to_allow in single_ip_allowed_set:
        return  # already whitelisted
    dbgprint('ip white added', ip_to_allow, 'info:', info_record_dict)
    single_ip_allowed_set.add(ip_to_allow)
    # the membership test below is cached -- invalidate it
    is_ip_not_in_allow_range.cache_clear()
    append_ip_whitelist_file(ip_to_allow)
    # audit log (best effort)
    try:
        with open(human_ip_verification_whitelist_log, 'a', encoding='utf-8') as fp:
            fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " " + ip_to_allow
                     + " " + str(request.user_agent)
                     + " " + repr(info_record_dict) + "\n")
    except:
        errprint('Unable to write log file', os.path.abspath(human_ip_verification_whitelist_log))
        traceback.print_exc()
@lru_cache(maxsize=256)
def is_ip_not_in_allow_range(ip_address):
    """Return True when *ip_address* is outside every allowed IP/network.

    :param ip_address: client IP as a string
    :return: True -> NOT allowed; False -> allowed
    """
    # individually-granted IPs take precedence
    if ip_address in single_ip_allowed_set:
        return False
    parsed = ipaddress.ip_address(ip_address)
    # allowed when it falls inside any of the default whitelist networks
    return not any(parsed in network
                   for network in human_ip_verification_default_whitelist_networks)
def convert_to_mirror_url(raw_url_or_path, remote_domain=None, is_scheme=None, is_escape=False):
    """
    convert url from remote to mirror url

    :param raw_url_or_path: url or path in the remote site's coordinates
    :param remote_domain: explicit remote domain; when omitted it is taken from
        the url itself, then from the current request path, then target_domain
    :param is_scheme: True forces the scheme+host prefix, False forbids it,
        None decides automatically from the input
    :param is_escape: input uses backslash-escaped slashes (``\\/``); output is
        re-escaped the same way
    """
    if is_escape:
        # Bugfix: the original wrote .replace('r\/', r'/') -- after Python's
        # literal processing that replaced the two characters "r/" with "/",
        # corrupting urls.  The intent (mirror of the '/'->'\/' escaping done
        # below) is to turn each literal backslash-slash back into a slash.
        _raw_url_or_path = raw_url_or_path.replace(r'\/', '/')
    else:
        _raw_url_or_path = raw_url_or_path
    sp = urlsplit(_raw_url_or_path)
    if '/extdomains/' == sp.path[:12]:
        # already a mirror url, pass through untouched
        return raw_url_or_path
    domain = remote_domain or sp.netloc or extract_from_url_may_have_extdomains(request.path)[0] or target_domain
    if domain not in allowed_domains_set:
        # foreign domain we do not mirror: leave the url alone
        return raw_url_or_path
    # include our scheme+host prefix when forced, or when the input was absolute
    if is_scheme or ((sp.scheme or _raw_url_or_path[:2] == '//') and is_scheme is not False):
        our_prefix = myurl_prefix
    else:
        our_prefix = ''
    # non-primary domains are routed through the /extdomains/ path prefix
    if domain not in domain_alias_to_target_set:
        remote_scheme = get_ext_domain_inurl_scheme_prefix(domain)
        middle_part = '/extdomains/' + remote_scheme + domain
    else:
        middle_part = ''
    result = urljoin(our_prefix + middle_part + '/',
                     extract_url_path_and_query(_raw_url_or_path).lstrip('/'))
    if is_escape:
        # re-apply the caller's slash escaping
        result = result.replace('/', r'\/')
    return response_text_rewrite(result)
# ########## End utils ###############
# ################# Begin Server Response Handler #################
def iter_streamed_response(requests_response_obj):
    """Generator yielding the remote response body chunk by chunk (stream mode).

    :param requests_response_obj: requests response opened with stream=True
    """
    bytes_streamed = 0
    for chunk in requests_response_obj.iter_content(stream_transfer_buffer_size):
        if verbose_level >= 4:
            # debug accounting of how much has been transferred so far
            bytes_streamed += len(chunk)
            dbgprint('total_size:', bytes_streamed)
        yield chunk
def copy_response(requests_response_obj, content=None, is_streamed=False):
    """
    Copy and parse remote server's response headers, generate our flask response object

    Header policy:
      * only headers in ``allowed_remote_response_headers`` are copied, except
        the CORS origin headers and Set-Cookie which are special-cased below;
        everything else from the remote is dropped
      * 'Location' redirect targets are rewritten back into the mirror
      * text 'Content-Type' values get ';charset=utf-8' appended
      * 'Set-Cookie' is rebuilt from the raw cookie headers and rewritten

    :type is_streamed: bool
    :param requests_response_obj: remote server's response, requests' response object (only headers and status are used)
    :param content: pre-rewrited response content, bytes
    :return: flask response object
    """
    if content is None:
        if is_streamed:
            dbgprint('Transfer Using Stream Mode:', requests_response_obj.url, request_local.cur_mime)
            # stream mode: hand flask a generator, body passes through lazily
            content = iter_streamed_response(requests_response_obj)
        else:
            # buffered mode: rewrite urls inside the body before sending
            content = response_content_rewrite(requests_response_obj)
    if verbose_level >= 3: dbgprint('RemoteRespHeader', requests_response_obj.headers)
    resp = Response(content, status=requests_response_obj.status_code)
    for header_key in requests_response_obj.headers:
        header_key_lower = header_key.lower()
        # Add necessary response headers from the origin site, drop other headers
        if header_key_lower in allowed_remote_response_headers:
            if header_key_lower == 'location':
                # redirect target must stay inside the mirror
                resp.headers[header_key] = convert_to_mirror_url(requests_response_obj.headers[header_key])
            elif header_key_lower == 'content-type':
                # force add utf-8 to content-type if it is text
                if is_mime_represents_text(requests_response_obj.headers[header_key]) \
                        and 'utf-8' not in requests_response_obj.headers[header_key]:
                    resp.headers[header_key] = extract_mime_from_content_type(
                        requests_response_obj.headers[header_key]) + ';charset=utf-8'
                else:
                    resp.headers[header_key] = requests_response_obj.headers[header_key]
            elif header_key_lower in ('access-control-allow-origin', 'timing-allow-origin'):
                # CORS: advertise the mirror's own origin, not the remote's
                resp.headers[header_key] = myurl_prefix
            else:
                resp.headers[header_key] = requests_response_obj.headers[header_key]
        # If we have the Set-Cookie header, we should extract the raw ones
        # and then change the cookie domain to our domain
        if header_key_lower == 'set-cookie':
            for cookie_string in response_cookies_deep_copy(requests_response_obj):
                try:
                    resp.headers.add('Set-Cookie', response_cookie_rewrite(cookie_string))
                except:  # NOTE(review): bare except skips a broken cookie silently; consider narrowing
                    traceback.print_exc()
    if verbose_level >= 3: dbgprint('OurRespHeaders:\n', resp.headers)
    return resp
# noinspection PyProtectedMember
def response_cookies_deep_copy(req_obj):
    """Recover the RAW 'Set-Cookie' header values from a requests response.

    requests folds repeated Set-Cookie headers into a single comma-joined
    value, which mangles them.  As an admitted hack we reach into urllib3's
    private ``_original_response.headers._headers`` -- a list of
    ``(name, value)`` tuples -- to get each header line back intact.

    raw_headers example:
       [('Cache-Control', 'private'),
       ('Content-Length', '48234'),
       ('Content-Type', 'text/html; Charset=utf-8'),
       ('Set-Cookie', 'aspsky=abcefgh; expires=Sun, 24-Apr-2016 16:00:00 GMT; path=/; HttpOnly'),
       ('Date', 'Tue, 26 Apr 2016 12:32:40 GMT')]

    When the mirror is served over plain http, the ``Secure`` attribute is
    stripped so browsers will still accept the cookie.

    :type req_obj: requests.models.Response
    """
    raw_headers = req_obj.raw._original_response.headers._headers  # private urllib3 API
    cookie_values = []
    for header_name, header_value in raw_headers:
        if header_name.lower() != 'set-cookie':
            continue
        if my_host_scheme == 'http://':
            header_value = header_value.replace('Secure;', '')
            header_value = header_value.replace(';Secure', ';')
            header_value = header_value.replace('; Secure', ';')
        cookie_values.append(header_value)
    return cookie_values
def response_content_rewrite(remote_resp_obj):
"""
Rewrite requests response's content's url. Auto skip binary (based on MIME).
:type remote_resp_obj: requests.models.Response
:param remote_resp_obj: requests response object
:return: bytes
"""