forked from tumashu/pyim
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pyim-cregexp.el
300 lines (257 loc) · 11.2 KB
/
pyim-cregexp.el
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
;;; pyim-cregexp.el --- Chinese regexp core tools for pyim. -*- lexical-binding: t; -*-
;; * Header
;; Copyright (C) 2021 Free Software Foundation, Inc.
;; Author: Feng Shu <[email protected]>
;; Maintainer: Feng Shu <[email protected]>
;; URL: https://github.com/tumashu/pyim
;; Keywords: convenience, Chinese, pinyin, input-method
;; This file is part of GNU Emacs.
;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;;; Code:
;; * Code                                                           :code:
(require 'cl-lib)
(require 'pyim-dcache)
(require 'pyim-imobjs)
(require 'pyim-pymap)
(require 'pyim-scheme)
(require 'rx)
(require 'xr)
;; Customization group for the user options defined in this file; it
;; is placed under the `pyim' group.
(defgroup pyim-cregexp nil
"Chinese regexp tools for pyim."
:group 'pyim)
;; NOTE(review): the `pyim-scheme' customization type used below is
;; not defined in this file -- presumably it is provided by the
;; pyim-scheme library; confirm.
(defcustom pyim-cregexp-fallback-scheme 'quanpin
"Fallback scheme used by `pyim-cregexp-build'.
This scheme is used when `pyim-cregexp-build' cannot support the
scheme the user is currently using."
:type 'pyim-scheme)
(defun pyim-cregexp-build (string &optional char-level-num chinese-only)
  "Build a Chinese regexp from STRING.

This is a convenience wrapper around `pyim-cregexp-create' and
takes mostly the same arguments.  The difference is that it has
no SCHEME argument: the scheme is picked dynamically by
`pyim-cregexp--scheme', which consults `pyim-default-scheme' and
`pyim-cregexp-fallback-scheme'.

CHAR-LEVEL-NUM and CHINESE-ONLY are passed to
`pyim-cregexp-create' unchanged."
  (pyim-cregexp-create
   string (pyim-cregexp--scheme) char-level-num chinese-only))
(defun pyim-cregexp--scheme (&optional scheme)
  "Return a scheme that supports cregexp.

Candidates are tried in this order: SCHEME, `pyim-default-scheme',
`pyim-cregexp-fallback-scheme' and finally the symbol `quanpin'.
The first candidate accepted by `pyim-cregexp--find-scheme' is
returned; nil is returned when none of them is accepted."
  (cl-some #'pyim-cregexp--find-scheme
           (list scheme
                 pyim-default-scheme
                 pyim-cregexp-fallback-scheme
                 'quanpin)))
(defun pyim-cregexp--find-scheme (scheme-or-name)
  "Return the scheme for SCHEME-OR-NAME if it supports cregexp.

SCHEME-OR-NAME is either a scheme object, which is used as is, or
some other value that is looked up with `pyim-scheme-get'.
Return nil when no scheme is found or when the scheme does not
support cregexp."
  (let ((candidate (cond ((pyim-scheme-p scheme-or-name) scheme-or-name)
                         (t (pyim-scheme-get scheme-or-name)))))
    (and (pyim-scheme-p candidate)
         (pyim-scheme-cregexp-support-p candidate)
         candidate)))
(defun pyim-cregexp-create (string scheme &optional char-level-num chinese-only)
  "Build a Chinese regexp from STRING, for finding Chinese text by its code.

For example: \"nihao\" -> \"[你呢...][好号...] \\| nihao\"

SCHEME is the input scheme used to interpret STRING.

CHAR-LEVEL-NUM is the commonness level of Chinese characters.
pyim sorts characters into four levels by frequency of use: level
1 is the most common and level 4 the least common; levels 1-3
hold roughly 8000 characters.  When this argument is 3, only
characters whose level is less than or equal to 3 are used to
build the regexp.

If CHINESE-ONLY is non-nil, the generated regexp can only find
Chinese text.

Note: when the generated regexp is too long for Emacs to handle,
some of the less common characters are dropped and the regexp is
generated again, until Emacs can handle it.  As a consequence,
rare characters can usually be found when searching for a single
character, but not when searching for a whole sentence.

STRING is returned unchanged when it is not a non-empty string,
or when SCHEME is not a scheme that supports cregexp."
  (cond
   ;; Nothing to convert: hand the argument back untouched.
   ((not (and string
              (stringp string)
              (> (length string) 0)))
    string)
   ;; No usable scheme: hand the argument back untouched as well.
   ((not (and scheme
              (pyim-scheme-p scheme)
              (pyim-scheme-cregexp-support-p scheme)))
    string)
   (t
    (pyim-cregexp--create-valid-cregexp-from-string
     string scheme char-level-num chinese-only))))
(defun pyim-cregexp--create-valid-cregexp-from-string
    (string scheme &optional char-level-num chinese-only)
  "Create from STRING a regexp for searching Chinese that Emacs accepts.

Candidates are produced by
`pyim-cregexp--create-beautiful-cregexp-from-string', starting
at the level obtained by passing CHAR-LEVEL-NUM through
`pyim-cregexp--char-level-num' and lowering the level by one
after every attempt.  The loop ends as soon as a candidate
satisfies `pyim-cregexp--valid-p' or the level is no longer
positive; the last candidate is returned even when it is still
not valid.

SCHEME and CHINESE-ONLY are passed on unchanged."
  (let ((level (pyim-cregexp--char-level-num char-level-num))
        (candidate nil))
    (while (and (not (pyim-cregexp--valid-p candidate))
                (> level 0))
      (setq candidate (pyim-cregexp--create-beautiful-cregexp-from-string
                       string scheme level chinese-only)
            ;; Use fewer (more common) characters on the next attempt.
            level (1- level)))
    candidate))
(defun pyim-cregexp--char-level-num (num)
  "Return a valid common-character level derived from NUM.

A numeric NUM is clamped to the range from 1 to 4.  Any other
value, including nil, yields 4."
  (cond ((not (numberp num)) 4)
        (t (max (min num 4) 1))))
(defun pyim-cregexp--valid-p (cregexp)
  "Return t when CREGEXP is a valid regexp.

CREGEXP is valid when it is a string and
`pyim-cregexp--match-error-p' reports no error for it.  Return
nil otherwise, in particular when CREGEXP is nil."
  (when (stringp cregexp)
    (not (pyim-cregexp--match-error-p cregexp))))
(defun pyim-cregexp--match-error-p (cregexp)
  "Return t when matching with CREGEXP signals an error, nil otherwise.

CREGEXP is tried against the empty string with `string-match-p'.
Emacs can't handle regexps whose length is too big :-("
  (condition-case nil
      (progn
        ;; Only whether an error is signaled matters here, not
        ;; whether the regexp actually matches.
        (string-match-p cregexp "")
        nil)
    (error t)))
(defun pyim-cregexp--create-beautiful-cregexp-from-string
    (string scheme &optional char-level-num chinese-only)
  "Use rx and xr to build a tidy regexp for searching Chinese from STRING.

STRING is parsed with `xr'.  Every string found in the resulting
rx form is converted with
`pyim-cregexp--create-cregexp-from-string' (using SCHEME,
CHAR-LEVEL-NUM and CHINESE-ONLY) and parsed with `xr' again;
non-string parts are kept as they are.  The rewritten form is
then turned back into a regexp with `rx-to-string'.

If any of these steps signals an error, or produces nil, STRING
itself is returned.

The returned regexp may be usable, or it may still exceed the
limits of Emacs."
  (let ((convert
         (lambda (x)
           (if (stringp x)
               (xr (pyim-cregexp--create-cregexp-from-string
                    x scheme char-level-num chinese-only))
             x))))
    (or (condition-case nil
            (rx-to-string
             (pyim-cregexp--create-cregexp-from-rx convert (xr string)))
          (error nil))
        string)))
(defun pyim-cregexp--create-cregexp-from-rx (fn rx-form)
  "Return a copy of RX-FORM in which FN has been applied to the leaves.

The cases are tried in this order:

- nil is returned as nil.
- For a list with exactly one element, FN is called on that
  element and its result is returned as is (not wrapped in a list).
- A list whose first element is the symbol `any' is returned
  unchanged.
- Every element of any other list, the first one included, is
  processed recursively and the results are collected in a new list.
- Anything else is passed to FN."
  (cond
   ((null rx-form) nil)
   ((and (consp rx-form) (null (cdr rx-form)))
    (funcall fn (car rx-form)))
   ((eq (car-safe rx-form) 'any) rx-form)
   ((consp rx-form)
    (mapcar (lambda (sub-form)
              (pyim-cregexp--create-cregexp-from-rx fn sub-form))
            rx-form))
   (t (funcall fn rx-form))))
(defun pyim-cregexp--create-cregexp-from-string
    (string scheme &optional char-level-num chinese-only)
  "Create a regexp for searching Chinese from the plain string STRING.

STRING is split with `pyim-cregexp--split-string' and the pieces
are converted by `pyim-cregexp--create-cregexp-from-string-list',
together with SCHEME, CHINESE-ONLY and CHAR-LEVEL-NUM, the latter
normalized by `pyim-cregexp--char-level-num'."
  (let ((level (pyim-cregexp--char-level-num char-level-num))
        (pieces (pyim-cregexp--split-string string)))
    ;; Make sure the pyim dictionaries are loaded.
    (pyim-dcache-init-variables)
    (pyim-cregexp--create-cregexp-from-string-list
     pieces scheme level chinese-only)))
(defun pyim-cregexp--split-string (string)
  "Split STRING into code pieces and the text between them.

Every match of the regexp \"[a-z]+\\='*\" in STRING becomes an
element of its own, and so does each stretch of text between two
matches.  Empty strings are left out of the returned list."
  (let* ((marker "#####&&&&#####")
         ;; Surround every match with the marker, then cut the
         ;; string at the markers.
         (marked (replace-regexp-in-string
                  "\\([a-z]+'*\\)" (concat marker "\\1" marker) string)))
    (split-string marked marker t)))
(defun pyim-cregexp--create-cregexp-from-string-list
    (string-list scheme &optional char-level-num chinese-only)
  "Convert the pieces in STRING-LIST and concatenate them into one regexp.

A piece that is empty, or that contains a character other than
a-z and the apostrophe, is copied to the result unchanged.

Any other piece is treated as an input code: its apostrophes are
removed, imobjs are created from it with `pyim-imobjs-create'
for SCHEME, and every imobj is turned into a regexp by
`pyim-cregexp-create-from-imobj' with CHAR-LEVEL-NUM.  These
regexps are joined with \"\\\\|\" and wrapped in a shy group.

Unless CHINESE-ONLY is non-nil, the piece itself (and, if it
differs, the piece without apostrophes) is added as a further
alternative, so that the original text can still be found.

When no Chinese regexp can be built for a piece, the group
contains only the piece itself, or is empty if CHINESE-ONLY is
non-nil."
  (mapconcat
   (lambda (string)
     (if (or (pyim-string-match-p "[^a-z']+" string)
             (equal string ""))
         string
       (let* ((string1 (replace-regexp-in-string "'" "" string))
              (imobjs (pyim-imobjs-create string1 scheme))
              (regexp-list
               (mapcar (lambda (imobj)
                         (pyim-cregexp-create-from-imobj
                          imobj scheme nil nil nil char-level-num))
                       imobjs))
              ;; Join unconditionally: `string-join' returns "" for an
              ;; empty list, so REGEXP is always a string.  A nil here
              ;; would be printed by `format' below as the literal
              ;; text "nil" and end up inside the regexp.
              (regexp (string-join (delq nil regexp-list) "\\|"))
              (regexp
               (cond
                (chinese-only regexp)
                ((equal regexp "") string)
                ((equal string string1)
                 (concat string "\\|" regexp))
                (t
                 (concat string "\\|" string1 "\\|" regexp)))))
         (format "\\(?:%s\\)" regexp))))
   string-list ""))
;; Methods are selected by the type of the scheme argument.
(cl-defgeneric pyim-cregexp-create-from-imobj
(imobj _scheme &optional match-beginning
first-equal all-equal char-level-num)
"Create a regexp for searching Chinese text from IMOBJ.

The methods defined in this file prefix the regexp with \"^\"
when MATCH-BEGINNING is non-nil, and return nil when no regexp
could be built.  How FIRST-EQUAL, ALL-EQUAL and CHAR-LEVEL-NUM
are used is up to each method.")
(cl-defmethod pyim-cregexp-create-from-imobj
  (imobj (_scheme pyim-scheme-quanpin)
         &optional match-beginning first-equal all-equal char-level-num)
  "Create a regexp for searching Chinese from IMOBJ, for quanpin schemes.

IMOBJ is turned into a list of pinyin strings, and the characters
for each pinyin are looked up with
`pyim-cregexp--quanpin-get-cchars-from-pinyin-list', to which
ALL-EQUAL, FIRST-EQUAL and the normalized CHAR-LEVEL-NUM are
passed.  Every resulting string that contains a Chinese
character becomes one character class; the classes are
concatenated in order.

If MATCH-BEGINNING is non-nil, the regexp is prefixed with \"^\".
Return nil when no character class could be built."
  (let* ((level (pyim-cregexp--char-level-num char-level-num))
         (cchars-list
          (pyim-cregexp--quanpin-get-cchars-from-pinyin-list
           (pyim-cregexp--quanpin-get-pinyin-list imobj)
           all-equal first-equal level))
         (classes nil))
    (dolist (cchars cchars-list)
      ;; Strings without any Chinese character contribute nothing.
      (when (pyim-string-match-p "\\cc" cchars)
        (push (format "[%s]" cchars) classes)))
    (let ((regexp (apply #'concat (nreverse classes))))
      (unless (equal regexp "")
        (concat (if match-beginning "^" "") regexp)))))
(defun pyim-cregexp--quanpin-get-pinyin-list (imobj)
  "Build a list of pinyin strings such as (\"ni\" \"hao\") from IMOBJ.

Each element of IMOBJ contributes one string, made by
concatenating its first two items."
  (mapcar (lambda (entry)
            (let ((head (car entry))
                  (tail (cadr entry)))
              (concat head tail)))
          imobj))
(defun pyim-cregexp--quanpin-get-cchars-from-pinyin-list
    (pinyin-list all-equal first-equal char-level-num)
  "Return one string of Chinese characters for each pinyin in PINYIN-LIST.

For example, the list (\"ni\" \"hao\") gives something like
\(\"你 ... 蔫 ... 鸟 ... 宁 ...\" \"好号毫\").

The characters are looked up with `pyim-pymap-py2cchar-get'.
Its second argument is non-nil for every pinyin when ALL-EQUAL is
non-nil, and for the first pinyin only when FIRST-EQUAL is
non-nil.

Every string returned by the lookup is split at \"|\", and only
the first CHAR-LEVEL-NUM parts (after normalization by
`pyim-cregexp--char-level-num') are kept and concatenated."
  (let ((level (pyim-cregexp--char-level-num char-level-num)))
    (cl-loop
     for py in pinyin-list
     for index from 0
     for exact = (or all-equal
                     (and first-equal (= index 0)))
     collect
     (mapconcat
      (lambda (entry)
        (let ((groups (split-string entry "|")))
          ;; Keep at most LEVEL groups of this entry.
          (apply #'concat
                 (cl-subseq groups 0 (min level (length groups))))))
      (pyim-pymap-py2cchar-get py exact nil t)
      ""))))
(cl-defmethod pyim-cregexp-create-from-imobj
  (imobj (scheme pyim-scheme-xingma)
         &optional match-beginning first-equal _all-equal _char-level-num)
  "Create a regexp for searching Chinese from IMOBJ, for xingma schemes.

Every element of IMOBJ is a code.  It is prefixed with the code
prefix of SCHEME, if there is one, and looked up in the
`code2word' dcache; the words found are converted with
`pyim-cregexp--build-xingma-regexp-from-words'.  The regexps of
all codes are concatenated in order.

If FIRST-EQUAL is non-nil, only the first character of each code
is used for the lookup.  If MATCH-BEGINNING is non-nil, the
regexp is prefixed with \"^\".  Return nil when the resulting
regexp is empty."
  (let* ((prefix (or (pyim-scheme-code-prefix scheme) ""))
         (lookup
          (lambda (item)
            (let* ((key (if first-equal (substring item 0 1) item))
                   (code (concat prefix key)))
              (pyim-cregexp--build-xingma-regexp-from-words
               (pyim-dcache-get code '(code2word))))))
         (regexp (mapconcat lookup imobj "")))
    (if (equal regexp "")
        nil
      (concat (if match-beginning "^" "") regexp))))
(defun pyim-cregexp--build-xingma-regexp-from-words (words)
  "Return a regexp that can find every word in WORDS.

For example, the words 工 and 恭恭敬敬 give
\"[工恭][恭]?[敬]?[敬]?\", which can find both \"工\" and
\"恭恭敬敬\".

The regexp has one character class per character position.  The
class for a position holds the character at that position of
every word that is long enough to have one, and every class
after the first is made optional with \"?\".

Return the empty string when WORDS is nil or holds only empty
strings."
  ;; The extra 0 keeps `max' from signaling an error when WORDS is
  ;; empty.  Lengths are never negative, so it cannot change the
  ;; result for a non-empty WORDS.
  (let ((longest (apply #'max 0 (mapcar #'length words)))
        classes)
    (dotimes (i longest)
      (push (format "[%s]%s"
                    (mapconcat (lambda (word)
                                 (if (< i (length word))
                                     (char-to-string (elt word i))
                                   ""))
                               words "")
                    (if (> i 0) "?" ""))
            classes))
    (apply #'concat (nreverse classes))))
;; * Footer
(provide 'pyim-cregexp)
;;; pyim-cregexp.el ends here