Commit 55afa6c: Initial commit.
Acciente717 committed Jun 15, 2020 (parent: e1b08b3)
Showing 2 changed files with 174 additions and 0 deletions.
README.md (new file, 27 additions)
# Chinese .docx Similarity Checker
This tool uses a randomized algorithm (MinHash over hashed word blocks) to compute the pairwise similarity of all .docx files in a given directory.

## Installing Dependencies
```
pip install python-docx thulac progressbar numpy
```

## Running the Program
```
usage: check_docx_similarity.py [-h] --dir DIR --out OUT
[--hash-width HASH_WIDTH]
[--hash-step HASH_STEP]
[--sample-cnt SAMPLE_CNT]
```
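
For example, a run over a hypothetical folder `./essays`, writing the results to `result.csv` (both paths are placeholders), would look like:
```
python check_docx_similarity.py --dir ./essays --out result.csv
```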

## Parameters
#### `--dir`
The directory containing the .docx files to compare.
#### `--out`
The path of the output .csv file.
#### `--hash-width`
The number of consecutive words in each hashing block; by default, 8 consecutive Chinese words form one block. Increasing this value lowers the false-positive rate but raises the false-negative rate. It has almost no effect on running time.
#### `--hash-step`
The distance between the starting positions of adjacent hashing blocks; by default, blocks start at consecutive positions (step 1). Increasing this value lowers the false-positive rate but raises the false-negative rate, and also speeds up the computation.
#### `--sample-cnt`
The number of random samples. For each .docx file, the program builds a high-dimensional vector, and file similarity is judged from the similarity between these vectors. Each dimension of a vector comes from one round of random sampling, so this parameter sets the vector's dimensionality. Increasing it makes the result more reliable but significantly increases running time. See the sketch below for how the sampling works.
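
To make the sampling concrete, here is a minimal standalone sketch of the underlying MinHash estimate (not code from this repository): the fraction of sampled dimensions on which two documents agree approximates the Jaccard similarity of their hash-block sets. The two block sets below are invented purely for illustration.
```
import random

doc_a = {'block%d' % i for i in range(100)}      # 100 hash blocks
doc_b = {'block%d' % i for i in range(50, 150)}  # shares 50 of them

universe = sorted(doc_a | doc_b)
sample_cnt = 1000

matches = 0
for _ in range(sample_cnt):
    perm = universe[:]
    random.shuffle(perm)
    rank = {block: i for i, block in enumerate(perm)}
    # One signature dimension per document: its lowest-ranked block
    # under this random permutation.
    if min(doc_a, key=rank.get) == min(doc_b, key=rank.get):
        matches += 1

# Expected output: close to |A ∩ B| / |A ∪ B| = 50 / 150 ≈ 33%.
print('estimated similarity: %.1f%%' % (100 * matches / sample_cnt))
```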
check_docx_similarity.py (new file, 147 additions)
import os
import sys
import docx
import thulac
import hashlib
import argparse
import progressbar
import numpy as np


def init_progress_bar(maxval):
    """(Re)create the global progress bar with `maxval` total steps."""
    global BAR
    BAR = progressbar.ProgressBar(
        maxval=maxval,
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    # Flush stdout so earlier messages appear before the bar starts drawing.
    print('', end='', flush=True)


def tokens_of_file(path, thu):
    """Read a .docx file and segment its text into Chinese words."""
    doc = docx.Document(path)
    string = ''.join([par.text.strip() for par in doc.paragraphs])

    # THULAC returns the segmentation as one space-separated string.
    tokens = thu.cut(string, text=True)

    # Drop punctuation and very common function words: they carry little
    # information about whether two documents share content.
    tokens = [token for token in tokens.split(' ')
              if token not in (',', '。', ':', ';', '《', '》', '、', ',', '.', ';', ':',
                               '\'', '"', '‘', '’', '“', '”', '(', ')', '(', ')',
                               '我', '得', '的', '地', '了', '会', '是', '你', '他', '她', '它')]
    return tokens


def hash_tokens(tokens, width, step):
    """Slide a window of `width` words over `tokens`, advancing by `step`,
    and MD5-hash each window to get one digest per hashing block."""
    return [
        hashlib.md5(''.join(tokens[idx:idx + width]).encode('utf-8')).digest()
        # The bound len(tokens) - width + 1 includes the last full window.
        for idx in range(0, len(tokens) - width + 1, step)
    ]


def number_digests(digests):
    """Map each digest to its row index in the block-document matrix."""
    return {digest: number for number, digest in enumerate(digests)}


def minhash_vec_from_matrix(matrix):
    """Draw one MinHash sample: randomly permute the rows (hash blocks) in
    place, then record, for each column (file), the index of the first row
    that file contains."""
    np.random.shuffle(matrix)
    row, col = matrix.shape

    vec = np.zeros(col, dtype=np.uint32)
    for c in range(col):
        for r in range(row):
            if matrix[r, c]:
                vec[c] = r
                break
    return vec


def minhash_matrix_from_matrix(matrix, hash_cnt):
    """Stack `hash_cnt` MinHash samples into a (hash_cnt, num_files) matrix."""
    vecs = []
    init_progress_bar(hash_cnt)
    BAR.start()
    for i in range(hash_cnt):
        vecs.append(minhash_vec_from_matrix(matrix))
        BAR.update(i)
    minhash_matrix = np.vstack(vecs)
    BAR.finish()
    return minhash_matrix


def similarity(vec1, vec2):
    """Count the dimensions on which two MinHash signatures agree."""
    return int(np.sum(vec1 == vec2))


def pairwise_similarity(minhash_matrix):
    """Return (left, right, matching-dimension count) for every file pair."""
    _, col = minhash_matrix.shape
    return [
        (left, right, similarity(minhash_matrix[:, left], minhash_matrix[:, right]))
        for left in range(col - 1)
        for right in range(left + 1, col)
    ]


def parse_args():
    parser = argparse.ArgumentParser(description='Check pairwise similarities of .docx files.')
    parser.add_argument('--dir', help='the directory containing .docx files', required=True)
    parser.add_argument('--out', help='the output file path', required=True)
    # The numeric flags need type=int; argparse passes them as strings otherwise.
    parser.add_argument('--hash-width', type=int, default=8,
                        help='the word length of a hashing block (default 8)')
    parser.add_argument('--hash-step', type=int, default=1,
                        help='the word step between hashing blocks (default 1)')
    parser.add_argument('--sample-cnt', type=int, default=1000,
                        help='sample count (default 1000)')

    args = parser.parse_args()
    # Load the THULAC Chinese segmentation model once and pass it around.
    args.thu = thulac.thulac(seg_only=True)
    return args


def main():
    args = parse_args()

    filenames = [file for file in os.listdir(args.dir)
                 if os.path.splitext(file)[1] == '.docx']

    all_digests = []
    filename_to_digests = {}

    # Pass 1: segment, shingle, and hash every document, remembering
    # each file's set of block digests.
    print('Digesting %d files...' % len(filenames), flush=True)
    init_progress_bar(len(filenames))
    BAR.start()
    for idx, filename in enumerate(filenames):
        path = os.path.join(args.dir, filename)
        digests = set(hash_tokens(tokens_of_file(path, args.thu),
                                  args.hash_width, args.hash_step))
        all_digests.append(digests)
        filename_to_digests[filename] = digests
        BAR.update(idx)
    BAR.finish()
    print('', end='', file=sys.stderr, flush=True)
    print(flush=True)

    # Number every distinct digest so it maps to one row of the boolean
    # incidence matrix: matrix[d, f] is True iff file f contains digest d.
    all_digests = sorted(set.union(*all_digests))
    digest_to_number = number_digests(all_digests)

    matrix = np.zeros((len(all_digests), len(filenames)), dtype=bool)

    print('Calculating MinHashes...', flush=True)
    for idx, filename in enumerate(filenames):
        for digest in filename_to_digests[filename]:
            row_num = digest_to_number[digest]
            matrix[row_num, idx] = True

    minhash_matrix = minhash_matrix_from_matrix(matrix, args.sample_cnt)
    print('', end='', file=sys.stderr, flush=True)
    print(flush=True)

    print('Calculating similarities between files...', flush=True)
    similarities = pairwise_similarity(minhash_matrix)
    similarities.sort(key=lambda x: x[2], reverse=True)

    # Write one line per pair, most similar first, reporting the
    # percentage of matching MinHash dimensions.
    print('Writing to %s ...' % args.out, flush=True)
    with open(args.out, 'wt') as f:
        print('file1,file2,similarity', file=f)
        for left, right, sim in similarities:
            sim = sim * 100 / args.sample_cnt
            print('%s,%s,%.1f%%' % (filenames[left], filenames[right], sim), file=f)
    print('Done.', flush=True)


if __name__ == '__main__':
    main()
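
A run over three files might then produce a .csv like the following; the file names and percentages here are purely illustrative:
```
file1,file2,similarity
essay_a.docx,essay_b.docx,41.3%
essay_a.docx,essay_c.docx,2.1%
essay_b.docx,essay_c.docx,1.8%
```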
