Commit 55afa6c: Initial commit.
Acciente717 committed Jun 15, 2020 (parent: e1b08b3)
Showing 2 changed files with 174 additions and 0 deletions.
README.md (new file, 27 additions)
# Chinese .docx Similarity Checker
This tool uses a randomized algorithm (MinHash over hashed word blocks) to compute the pairwise similarity of all .docx files in a given directory.

## Installing Dependencies
```
pip install python-docx thulac progressbar numpy
```

## Running the Program
```
usage: check_docx_similarity.py [-h] --dir DIR --out OUT
[--hash-width HASH_WIDTH]
[--hash-step HASH_STEP]
[--sample-cnt SAMPLE_CNT]
```
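
For example, a run over a hypothetical folder `./essays`, writing the results to `result.csv` (both paths are placeholders), would look like:
```
python check_docx_similarity.py --dir ./essays --out result.csv
```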

## Parameters
#### `--dir`
The directory containing the .docx files to compare.
#### `--out`
The path of the output .csv file.
#### `--hash-width`
The number of consecutive words in each hashing block; by default, 8 consecutive Chinese words form one block. Increasing this value lowers the false-positive rate but raises the false-negative rate. It has almost no effect on running time.
#### `--hash-step`
The distance between the starting positions of adjacent hashing blocks; by default, blocks start at consecutive positions (step 1). Increasing this value lowers the false-positive rate but raises the false-negative rate, and also speeds up the computation.
#### `--sample-cnt`
The number of random samples. For each .docx file, the program builds a high-dimensional vector, and file similarity is judged from the similarity between these vectors. Each dimension of a vector comes from one round of random sampling, so this parameter sets the vector's dimensionality. Increasing it makes the result more reliable but significantly increases running time. See the sketch below for how the sampling works.
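
To make the sampling concrete, here is a minimal standalone sketch of the underlying MinHash estimate (not code from this repository): the fraction of sampled dimensions on which two documents agree approximates the Jaccard similarity of their hash-block sets. The two block sets below are invented purely for illustration.
```
import random

doc_a = {'block%d' % i for i in range(100)}      # 100 hash blocks
doc_b = {'block%d' % i for i in range(50, 150)}  # shares 50 of them

universe = sorted(doc_a | doc_b)
sample_cnt = 1000

matches = 0
for _ in range(sample_cnt):
    perm = universe[:]
    random.shuffle(perm)
    rank = {block: i for i, block in enumerate(perm)}
    # One signature dimension per document: its lowest-ranked block
    # under this random permutation.
    if min(doc_a, key=rank.get) == min(doc_b, key=rank.get):
        matches += 1

# Expected output: close to |A ∩ B| / |A ∪ B| = 50 / 150 ≈ 33%.
print('estimated similarity: %.1f%%' % (100 * matches / sample_cnt))
```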
check_docx_similarity.py (new file, 147 additions)
import os
import sys
import docx
import thulac
import hashlib
import argparse
import progressbar
import numpy as np


def init_progress_bar(maxval):
    """(Re)create the global progress bar with `maxval` total steps."""
    global BAR
    BAR = progressbar.ProgressBar(
        maxval=maxval,
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    # Flush stdout so earlier messages appear before the bar starts drawing.
    print('', end='', flush=True)


def tokens_of_file(path, thu):
    """Read a .docx file and segment its text into Chinese words."""
    doc = docx.Document(path)
    string = ''.join([par.text.strip() for par in doc.paragraphs])

    # THULAC returns the segmentation as one space-separated string.
    tokens = thu.cut(string, text=True)

    # Drop punctuation and very common function words: they carry little
    # information about whether two documents share content.
    tokens = [token for token in tokens.split(' ')
              if token not in (',', '。', ':', ';', '《', '》', '、', ',', '.', ';', ':',
                               '\'', '"', '‘', '’', '“', '”', '(', ')', '(', ')',
                               '我', '得', '的', '地', '了', '会', '是', '你', '他', '她', '它')]
    return tokens


def hash_tokens(tokens, width, step):
    """Slide a window of `width` words over `tokens`, advancing by `step`,
    and MD5-hash each window to get one digest per hashing block."""
    return [
        hashlib.md5(''.join(tokens[idx:idx + width]).encode('utf-8')).digest()
        # The bound len(tokens) - width + 1 includes the last full window.
        for idx in range(0, len(tokens) - width + 1, step)
    ]


def number_digests(digests):
    """Map each digest to its row index in the block-document matrix."""
    return {digest: number for number, digest in enumerate(digests)}


def minhash_vec_from_matrix(matrix):
    """Draw one MinHash sample: randomly permute the rows (hash blocks) in
    place, then record, for each column (file), the index of the first row
    that file contains."""
    np.random.shuffle(matrix)
    row, col = matrix.shape

    vec = np.zeros(col, dtype=np.uint32)
    for c in range(col):
        for r in range(row):
            if matrix[r, c]:
                vec[c] = r
                break
    return vec


def minhash_matrix_from_matrix(matrix, hash_cnt):
    """Stack `hash_cnt` MinHash samples into a (hash_cnt, num_files) matrix."""
    vecs = []
    init_progress_bar(hash_cnt)
    BAR.start()
    for i in range(hash_cnt):
        vecs.append(minhash_vec_from_matrix(matrix))
        BAR.update(i)
    minhash_matrix = np.vstack(vecs)
    BAR.finish()
    return minhash_matrix


def similarity(vec1, vec2):
    """Count the dimensions on which two MinHash signatures agree."""
    return int(np.sum(vec1 == vec2))


def pairwise_similarity(minhash_matrix):
    """Return (left, right, matching-dimension count) for every file pair."""
    _, col = minhash_matrix.shape
    return [
        (left, right, similarity(minhash_matrix[:, left], minhash_matrix[:, right]))
        for left in range(col - 1)
        for right in range(left + 1, col)
    ]


def parse_args():
    parser = argparse.ArgumentParser(description='Check pairwise similarities of .docx files.')
    parser.add_argument('--dir', help='the directory containing .docx files', required=True)
    parser.add_argument('--out', help='the output file path', required=True)
    # The numeric flags need type=int; argparse passes them as strings otherwise.
    parser.add_argument('--hash-width', type=int, default=8,
                        help='the word length of a hashing block (default 8)')
    parser.add_argument('--hash-step', type=int, default=1,
                        help='the word step between hashing blocks (default 1)')
    parser.add_argument('--sample-cnt', type=int, default=1000,
                        help='sample count (default 1000)')

    args = parser.parse_args()
    # Load the THULAC Chinese segmentation model once and pass it around.
    args.thu = thulac.thulac(seg_only=True)
    return args


def main():
    args = parse_args()

    filenames = [file for file in os.listdir(args.dir)
                 if os.path.splitext(file)[1] == '.docx']

    all_digests = []
    filename_to_digests = {}

    # Pass 1: segment, shingle, and hash every document, remembering
    # each file's set of block digests.
    print('Digesting %d files...' % len(filenames), flush=True)
    init_progress_bar(len(filenames))
    BAR.start()
    for idx, filename in enumerate(filenames):
        path = os.path.join(args.dir, filename)
        digests = set(hash_tokens(tokens_of_file(path, args.thu),
                                  args.hash_width, args.hash_step))
        all_digests.append(digests)
        filename_to_digests[filename] = digests
        BAR.update(idx)
    BAR.finish()
    print('', end='', file=sys.stderr, flush=True)
    print(flush=True)

    # Number every distinct digest so it maps to one row of the boolean
    # incidence matrix: matrix[d, f] is True iff file f contains digest d.
    all_digests = sorted(set.union(*all_digests))
    digest_to_number = number_digests(all_digests)

    matrix = np.zeros((len(all_digests), len(filenames)), dtype=bool)

    print('Calculating MinHashes...', flush=True)
    for idx, filename in enumerate(filenames):
        for digest in filename_to_digests[filename]:
            row_num = digest_to_number[digest]
            matrix[row_num, idx] = True

    minhash_matrix = minhash_matrix_from_matrix(matrix, args.sample_cnt)
    print('', end='', file=sys.stderr, flush=True)
    print(flush=True)

    print('Calculating similarities between files...', flush=True)
    similarities = pairwise_similarity(minhash_matrix)
    similarities.sort(key=lambda x: x[2], reverse=True)

    # Write one line per pair, most similar first, reporting the
    # percentage of matching MinHash dimensions.
    print('Writing to %s ...' % args.out, flush=True)
    with open(args.out, 'wt') as f:
        print('file1,file2,similarity', file=f)
        for left, right, sim in similarities:
            sim = sim * 100 / args.sample_cnt
            print('%s,%s,%.1f%%' % (filenames[left], filenames[right], sim), file=f)
    print('Done.', flush=True)


if __name__ == '__main__':
    main()
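
A run over three files might then produce a .csv like the following; the file names and percentages here are purely illustrative:
```
file1,file2,similarity
essay_a.docx,essay_b.docx,41.3%
essay_a.docx,essay_c.docx,2.1%
essay_b.docx,essay_c.docx,1.8%
```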
