-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdomain_dictionaries.py
85 lines (68 loc) · 2.53 KB
/
domain_dictionaries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
__author__ = 'matic'
import os,re
# Replaces every key of the dict that occurs in the text with its value.
def replace_all(text, dic):
    """Replace whole-token occurrences of every key of ``dic`` in ``text``.

    Keys are applied one after another, keys made of more space-separated
    words first, so a multi-word term is substituted before any shorter term
    it contains. A key only matches when it is delimited on both sides by
    whitespace or by the start/end of ``text``; it never matches inside a
    longer word. Because keys are applied sequentially, text produced by an
    earlier substitution can be matched again by a later key.

    Args:
        text: The string to rewrite.
        dic: Mapping of search term -> replacement string.

    Returns:
        A ``(new_text, count)`` tuple. ``count`` is the total number of
        substitutions performed over all keys, and 0 when ``dic`` is empty.
    """
    total = 0
    # sorted() is stable: keys with the same word count keep the dict's order.
    ordered = sorted(dic.items(), key=lambda item: len(item[0].split(" ")),
                     reverse=True)
    for term, replacement in ordered:
        # re.escape() neutralises every regex metacharacter in the term, not
        # only brackets, parentheses and dots.
        # (?<!\S) / (?!\S) mean "preceded / followed by whitespace or by the
        # edge of the string".
        pattern = r'(?<!\S)' + re.escape(term) + r'(?!\S)'
        # A callable replacement is inserted literally, so backslashes in the
        # replacement are not treated as group references or escapes.
        text, hits = re.subn(pattern,
                             lambda _match, value=replacement: value,
                             text)
        total += hits
    return text, total
def count(file_name, dic):
    """Count substring occurrences of the keys of ``dic`` in a text file.

    The file that is read is ``<stem>.txt``, where ``<stem>`` is
    ``file_name`` with everything after the first dot of its last path
    component removed (``"pdr"``, ``"pdr.txt"`` and ``"pdr.old.txt"`` all
    read ``pdr.txt``). Dots in directory names are left alone.

    Matching is plain ``str.count`` per line: a key is counted wherever it
    occurs as a substring, including inside longer words, and each key is
    counted independently of the others.

    Args:
        file_name: Path of the text file, with or without extension.
        dic: Mapping (or any iterable of strings) whose keys are counted.

    Returns:
        The summed number of occurrences over all keys and all lines.
    """
    # Strip the extension from the base name only, so that paths such as
    # "./data/pdr" or "v1.0/pdr.txt" are not truncated at a directory's dot.
    stem = os.path.join(os.path.dirname(file_name),
                        os.path.basename(file_name).split(".")[0])
    total = 0
    # The context manager closes the file even if reading fails.
    with open(stem + ".txt", 'r') as handle:
        for line in handle:
            if line == "\n":
                continue
            for word in dic:
                total += line.count(word)
    return total
def _load_domain_terms(dictionaries_path):
    """Build a term -> first-term-of-its-line mapping from a directory."""
    mapping = {}
    # Sorted so that, when a term occurs in several dictionary files, the
    # entry that wins does not depend on the order os.listdir() returns.
    for dict_name in sorted(os.listdir(dictionaries_path)):
        dict_path = os.path.join(dictionaries_path, dict_name)
        if not os.path.isfile(dict_path):
            # Sub-directories cannot be opened as dictionaries; skip them.
            continue
        with open(dict_path, 'r') as handle:
            for line in handle:
                # "_" stands for a space inside a term. split()/join collapses
                # runs of whitespace and drops the line ending and any padding
                # around the commas.
                terms = [" ".join(raw.split())
                         for raw in line.replace("_", " ").split(",")]
                terms = [term for term in terms if term]
                if not terms:
                    # Blank or separator-only line: nothing to register.
                    continue
                canonical = terms[0]
                for term in terms:
                    mapping[term] = canonical
    return mapping


def use_domain_dictionary(file_name,dictionaries_path):
    """Rewrite a text file, mapping dictionary terms to their first synonym.

    Every file in ``dictionaries_path`` is read as a dictionary: each line is
    a comma-separated list of terms (``_`` stands for a space), and every
    term on a line is mapped to the first term of that line.

    With ``<stem>`` being ``file_name`` stripped of everything after the
    first dot of its last path component, the function:

    * prints the assembled term mapping;
    * writes the distinct target terms to ``<stem>_dict_keys.txt``, each one
      followed by a comma;
    * reads ``<stem>.txt`` line by line, skips empty lines, passes each
      remaining line through ``replace_all`` and writes the result to
      ``<stem>_dict.txt``;
    * prints the sum of the counts returned by ``replace_all``.

    Args:
        file_name: Path of the input text file, with or without extension.
        dictionaries_path: Directory holding the dictionary files.

    Returns:
        None.
    """
    # Strip the extension from the base name only, so that paths such as
    # "./data/pdr" or "v1.0/pdr.txt" are not truncated at a directory's dot.
    stem = os.path.join(os.path.dirname(file_name),
                        os.path.basename(file_name).split(".")[0])

    dictionary = _load_domain_terms(dictionaries_path)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(dictionary)

    with open(stem + "_dict_keys.txt", 'w') as keys_file:
        # Sorted so the file content is reproducible between runs.
        for base_term in sorted(set(dictionary.values())):
            keys_file.write(base_term + ",")

    # Named "total" so the module-level function count() is not shadowed.
    total = 0
    with open(stem + ".txt", 'r') as source, \
            open(stem + "_dict.txt", 'w') as target:
        for line in source:
            if line == "\n":
                continue
            replaced, hits = replace_all(line.split("\n")[0], dictionary)
            target.write(replaced + "\n")
            total += hits
    print(total)
#use_domain_dictionary("pdr","dictionaries")