Commit 55afa6c (1 parent: e1b08b3). Showing 2 changed files with 174 additions and 0 deletions.

# Chinese .docx File Plagiarism Checker

This tool uses a randomized algorithm to compute the pairwise similarity of the .docx files in a given directory.

## Installing Dependencies

```
pip install python-docx thulac progressbar numpy
```

## Running the Program

```
usage: check_docx_similarity.py [-h] --dir DIR --out OUT
                                [--hash-width HASH_WIDTH]
                                [--hash-step HASH_STEP]
                                [--sample-cnt SAMPLE_CNT]
```
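For example, to compare all the essays in one folder (both paths below are placeholders, not paths the tool requires):

```
python check_docx_similarity.py --dir ./essays --out ./result.csv
```

This hashes every .docx file under `./essays` and writes a pairwise similarity table, sorted from most to least similar, to `./result.csv`.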
## Arguments

#### `--dir`
The directory containing the .docx files to compare.

#### `--out`
The path of the output .csv file.

#### `--hash-width`
The number of words in one hash block; by default 8 consecutive Chinese words form a block. Increasing this value lowers the false-positive rate but raises the false-negative rate. It has almost no effect on running time.
#### `--hash-step`
The stride between the starting positions of two adjacent hash blocks; by default every word starts a block (step 1). Increasing this value lowers the false-positive rate but raises the false-negative rate, and it speeds up the computation. The sketch below shows how width and step interact.
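A minimal sketch of the blocking step, mirroring what the script's `hash_tokens` function does; the five-word token list and the width/step values are invented for illustration:

```
import hashlib

tokens = ['机器', '学习', '方法', '研究', '进展']  # example segmented words

width, step = 3, 2
blocks = [''.join(tokens[i:i + width])
          for i in range(0, len(tokens) - width + 1, step)]
# blocks == ['机器学习方法', '方法研究进展']
digests = [hashlib.md5(b.encode('utf-8')).digest() for b in blocks]
```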
#### `--sample-cnt`
The number of random samples. For each .docx file the program builds a high-dimensional vector and judges file similarity by the similarity between these vectors; each dimension of a vector comes from one random sample, so this parameter sets the dimensionality. Increasing it makes the result more reliable but significantly increases running time. A worked sketch of the estimator follows.
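Every matching dimension is one random sample on which the two files' MinHash values agree, so the fraction of matching dimensions estimates the Jaccard similarity of the files' hash-block sets. A self-contained sketch using two made-up digest sets whose true Jaccard similarity is 2/4 = 0.5:

```
import numpy as np

a = {'d1', 'd2', 'd3'}
b = {'d2', 'd3', 'd4'}

universe = sorted(a | b)
rng = np.random.default_rng(0)

def minhash(s, order):
    # rank of s's first member under one random permutation of the universe
    return min(order[universe.index(d)] for d in s)

samples = 1000
matches = sum(minhash(a, order) == minhash(b, order)
              for order in (rng.permutation(len(universe))
                            for _ in range(samples)))
print(matches / samples)  # close to 0.5
```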
## check_docx_similarity.py

import os
import sys
import docx
import thulac
import hashlib
import argparse
import progressbar
import numpy as np


def init_progress_bar(maxval):
    # Build a module-level progress bar: an '=' fill between brackets plus a
    # percentage readout.
    global BAR
    BAR = progressbar.ProgressBar(
        maxval=maxval,
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    print('', end='', flush=True)  # flush stdout before the bar starts drawing


def tokens_of_file(path, thu):
    # Join all paragraph text of the .docx file, then segment it into Chinese
    # words with THULAC.
    doc = docx.Document(path)
    string = ''.join(par.text.strip() for par in doc.paragraphs)

    tokens = thu.cut(string, text=True)

    # Drop punctuation and very common function words, which carry no signal
    # for plagiarism detection.
    stopwords = (',', '。', ':', ';', '《', '》', '、', ',', '.', ';', ':',
                 '\'', '"', '‘', '’', '“', '”', '(', ')', '(', ')',
                 '我', '得', '的', '地', '了', '会', '是', '你', '他', '她', '它')
    return [token for token in tokens.split(' ') if token not in stopwords]


def hash_tokens(tokens, width, step):
    # Slide a window of `width` words across the token list with stride
    # `step` and MD5-hash each window; the upper bound of
    # len(tokens) - width + 1 keeps the final full window.
    return [
        hashlib.md5(''.join(tokens[idx:idx + width]).encode('utf-8')).digest()
        for idx in range(0, len(tokens) - width + 1, step)
    ]


def number_digests(digests):
    # Map each distinct digest to its row index in the shingle/document matrix.
    return {digest: number for number, digest in enumerate(digests)}


def minhash_vec_from_matrix(matrix):
    # One MinHash sample: randomly permute the rows in place, then record, for
    # each document (column), the index of the first row whose shingle that
    # document contains.
    np.random.shuffle(matrix)
    row, col = matrix.shape

    vec = np.zeros(col, dtype=np.uint32)
    for c in range(col):
        for r in range(row):
            if matrix[r, c]:
                vec[c] = r
                break
    return vec


def minhash_matrix_from_matrix(matrix, hash_cnt):
    # Stack hash_cnt independent MinHash samples into a signature matrix with
    # one row per sample and one column per document.
    vecs = []
    init_progress_bar(hash_cnt)
    BAR.start()
    for i in range(hash_cnt):
        vecs.append(minhash_vec_from_matrix(matrix))
        BAR.update(i)
    minhash_matrix = np.vstack(vecs)
    BAR.finish()
    return minhash_matrix


def similarity(vec1, vec2):
    # Count the samples on which the two signatures agree; divided by the
    # sample count, this is a MinHash estimate of the Jaccard similarity.
    return int(np.sum(vec1 == vec2))


def pairwise_similarity(minhash_matrix):
    # Similarity of every unordered pair of documents (columns).
    _, col = minhash_matrix.shape
    return [
        (left, right, similarity(minhash_matrix[:, left], minhash_matrix[:, right]))
        for left in range(col - 1)
        for right in range(left + 1, col)
    ]


def parse_args():
    parser = argparse.ArgumentParser(description='Check pairwise similarities of .docx files.')
    parser.add_argument('--dir', help='the directory containing .docx files', required=True)
    parser.add_argument('--out', help='the output file path', required=True)
    # type=int is required: without it, values supplied on the command line
    # arrive as strings and break the slicing and arithmetic that use them.
    parser.add_argument('--hash-width', type=int, default=8,
                        help='the word length of a hashing block (default 8)')
    parser.add_argument('--hash-step', type=int, default=1,
                        help='the word step between hashing blocks (default 1)')
    parser.add_argument('--sample-cnt', type=int, default=1000,
                        help='sample count (default 1000)')

    args = parser.parse_args()
    args.thu = thulac.thulac(seg_only=True)  # segmentation-only THULAC instance
    return args


def main():
    args = parse_args()

    filenames = [file for file in os.listdir(args.dir)
                 if os.path.splitext(file)[1] == '.docx']

    all_digests = []
    filename_to_digests = {}

    print('Digesting %d files...' % len(filenames), flush=True)
    init_progress_bar(len(filenames))
    BAR.start()
    for idx, filename in enumerate(filenames):
        path = os.path.join(args.dir, filename)
        digests = set(hash_tokens(tokens_of_file(path, args.thu),
                                  args.hash_width, args.hash_step))
        all_digests.append(digests)
        filename_to_digests[filename] = digests
        BAR.update(idx)
    BAR.finish()
    print('', end='', file=sys.stderr, flush=True)  # flush the bar's stderr output
    print(flush=True)

    # Union of every file's digests, sorted so the row numbering is deterministic.
    all_digests = sorted(set.union(*all_digests))
    digest_to_number = number_digests(all_digests)

    # Boolean shingle/document incidence matrix: matrix[r, c] is True iff
    # document c contains shingle r. Plain bool is used because the np.bool
    # alias was removed from NumPy.
    matrix = np.zeros((len(all_digests), len(filenames)), dtype=bool)

    print('Calculating MinHashes...', flush=True)
    for idx, filename in enumerate(filenames):
        for digest in filename_to_digests[filename]:
            row_num = digest_to_number[digest]
            matrix[row_num, idx] = True

    minhash_matrix = minhash_matrix_from_matrix(matrix, args.sample_cnt)
    print('', end='', file=sys.stderr, flush=True)
    print(flush=True)

    print('Calculating similarities between files...', flush=True)
    similarities = pairwise_similarity(minhash_matrix)
    similarities.sort(key=lambda x: x[2], reverse=True)

    print('Writing to %s ...' % args.out, flush=True)
    with open(args.out, 'wt') as f:
        print('file1,file2,similarity', file=f)
        for left, right, sim in similarities:
            sim = sim * 100 / args.sample_cnt  # matching samples -> percentage
            print('%s,%s,%.1f%%' % (filenames[left], filenames[right], sim), file=f)
    print('Done.', flush=True)


if __name__ == '__main__':
    main()