forked from TheKnight909/Emotional-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreProcess.py
145 lines (122 loc) · 10.2 KB
/
preProcess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import re
import regex
import nltk
from pyarabic import araby
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
import emoji
# Ensure the NLTK stop-word corpus (used for the Arabic stop-word list) is
# installed. Only call the downloader when the corpus cannot be found locally,
# so importing this module does not trigger a download check on every import.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
# Cleaning pipeline for Arabic text: preprocess_text() chains the replace_*,
# remove_*, normalize_arabic and lemmatize_arabic steps defined on this class.
class ArabicTextPreprocessor:
# Sets up the state the steps read: the self.emojis and self.emoticons
# replacement maps, the self.negation_words list that remove_stop_words keeps,
# and self.stemmer (an ISRIStemmer) used by lemmatize_arabic.
def __init__(self):
self.emojis = {
"๐":"ุถุญู", "๐คฃ":"ุถุญู", "๐":"ุถุญู", "๐":"ุญุฒู", "๐":"ุถุญู", "โค๏ธ":"ุญุจ", "โค":"ุญุจ", "๐":"ุญุจ", "๐ญ":"ุญุฒู", "๐ข":"ุญุฒู",
"๐":"ุญุฒู", "โฅ":"ุญุจ", "๐":"ุญุจ", "๐
":"ุถุญู", "๐":"ุญุฒู", "๐":"ุญุจ", "๐":"ุญุจ", "๐":"ุญุฒู", "๐":"ูุฑุญ", "๐":"ูุตูู",
"๐":"ุงุญุณูุช", "๐ด":"ููู
", "๐":"ุถุญู", "๐":"ุญุฒู", "๐น":"ูุฑุฏุฉ", "๐":"ุญุจ", "๐":"ุถุญู", "๐":"ู
ุญุงูุฏ", "โ":"ู
ูุชุตุฑ", "โจ":"ูุฌู
ู",
"๐ค":"ุชูููุฑ", "๐":"ูุณุชูุฒุก", "๐":"ุบุถุจ", "๐":"ู
ูู", "๐":"ุบุถุจ", "๐":"ุถุญู", "๐ธ":"ูุฑุฏุฉ", "๐":"ุญุฒู", "๐":"ุญุจ", "๐":"ุญุจ",
"๐":"ุบุถุจ", "๐ญ":"ุชูููุฑ", "๐":"ุซูุฉ", "๐":"ุญุจ", "๐ฉ":"ุญุฒู", "๐ช":"ุนุถูุงุช", "๐":"ู
ูุงูู", "๐๐ป":"ุดูุฑ", "๐ณ":"ููู", "๐๐ผ":"ุชุตููู",
"๐ถ":"ู
ูุณููู", "๐":"ุตู
ุช", "๐":"ุญุจ", "๐":"ุดูุฑ", "๐":"ุญุจ", "๐":"ุณูุงู
", "โบ":"ุถุญู", "๐ธ":"ุถูุฏุน", "๐ถ":"ููู", "โ๐ป":"ุชููู",
"๐":"ูุฑุญ", "๐ท":"ุญุจ", "๐":"ูุฑุญ", "๐ซ":"ุญุฒู", "๐จ":"ุฎูู", "๐ผ ":"ู
ูุณููู", "๐":"ู
ุฑุญ", "๐":"ู
ุฑุญ", "๐":"ุญุจ", "๐ช":"ุญุฒู",
"๐":"ุถุญู", "๐ฃ":"ุบุถุจ", "โบ๏ธ":"ุญุจ", "๐ฑ":"ุฎูู", "๐":"ุถุญู", "๐":"ุงุณุชูุงุก", "๐๐ผ":"ูุฌุฑู", "๐ก":"ุบุถุจ", "๐ถ":"ูุณูุฑ", "๐ค":"ู
ุฑุถ",
"โผ๏ธ":"ุชุนุฌุจ", "๐":"ุทุงุฆุฑ", "๐๐ป":"ุงุญุณูุช", "โฃ":"ุญุจ", "๐":"ู
ุตุฏูู
", "๐":"ู
ุฑุญ", "๐๐ผ":"ู
ุฑุญ", "๐":"ู
ุฑุญ", "๐":"ุถุฑุจุฉ", "๐":"ุงุณุชูุงุก",
"๐":"ุญุจ", "๐ฅ":"ุญุฒู", "๐ป":"ู
ูุณููู", "โ":"ููุชุจ", "๐ถ๐ป":"ูุณูุฑ", "๐":"ุงูู
ุงุณ", "๐ท":"ู
ุฑุถ", "โ":"ูุงุญุฏ", "๐ฌ":"ุชุฏุฎูู",
"๐" : "ูุฑุฏ", "๐" : "ุดู
ุณ", "๐" : "ุงูุงูู", "โ ๏ธ" :"ุชุญุฐูุฑ", "๐ค" : "ุงุญุชูุงุก", "โ๏ธ": "ุบูุท", "๐" : "ู
ูุงู", "๐ธ" : "ู
ููู",
"๐" : "ุชุงุฌ", "โ๏ธ" : "ุตุญ", "๐": "ููุจ", "๐ฒ" : "ู
ูุฏูุด", "๐ฆ": "ู
ุงุก", "๐ซ" : "ุฎุทุง", "๐๐ป" : "ู
ู
ุชุงุฒ", "๐" :"ูุณุจุญ", "๐๐ป": "ุชู
ุงู
",
"โญ๏ธ" :"ุฏุงุฆุฑู", "๐ท" : "ู
ูุณููู", "๐": "ุชูููุญ ุจุงููุฏ", "โ๐ผ": "ุนูุงู
ู ุงููุตุฑ", "๐":"ุถุญู", "โฟ" : "ุนูุฏู ู
ุฒุฏูุฌู", "๐ช๐ผ" : "ููู",
"๐ฉ": "ุชูุงุตู ู
ุนู", "โ๏ธ": "ูููู", "๐ง" : "ุบุถุจ", "๐จ": "ุฑุณุงูุฉ", "โ๏ธ" :"ุชุนุฌุจ", "๐๐ป": "ุงุดุงุฑู ู
ูุงููู", "๐ฏ" :"ุงุฎูุงุช", "ยฉ" : "ุฑู
ุฒ",
"๐ต๐ฝ" :"ุณูุฏู ุนุฌูุฒู", "๐ฃ": "ูุชููุช", "๐": "ุชุดุฌูุน", "๐": "ุดุฎุต ููุญูู", "๐๐ฝ":"ุงูุฏู ู
ูุชูุญู", "๐๐ฝ": "ุจุงูุธุจุท", "โ๏ธ" : "ุงุณุชููุงุฑ",
"โฝ๏ธ": "ููุฑู", "๐ถ" :"ุญุจ", "๐" :"ุจุงููู", "๐": "ูุฑุฏู", "๐ต": "ูููุณ", "๐": "ูุฑุญ", "๐": "ูุฑุญ", "๐ ": "ุบุถุจ", "โ๐ป": "ููุชุจ",
"๐พ": "ุงุฑุฒ", "๐ฃ": "ุงุซุฑ ูุฏู
ูู", "โ":"ุฑูุถ", "๐":"ุทุนุงู
", "๐ฌ":"ุตุฏุงูุฉ", "๐ฐ":"ุงุฑูุจ", "โ":"ู
ุทุฑ", "โ":"ู
ู
ููุฉ ูุฑูุณุง", "๐":"ุฎุฑูู",
"๐ฃ":"ุตูุช ู
ุฑุชูุน", "๐๐ผ":"ุงุญุณูุช", "โ":"ู
ุฑุญ", "๐ฎ":"ุฎูู", "๐ฆ":"ุฎูู", "โญ":"ุงูุญู", "โ๏ธ":"ููู
", "โน":"ู
ุนููู
ุงุช", "๐๐ป":"ุฑูุถ", "โช๏ธ":"ูุถุงุฑุฉ ููุงุก",
"๐ค":"ุญุฒู", "๐ซ":"ู
ุฑุญ", "๐":"ุญุจ", "๐":"ุทุนุงู
", "โค๏ธ":"ุญุจ", "โ๏ธ":"ุณูุฑ", "๐๐ปโโ๏ธ":"ูุณูุฑ", "๐ณ":"ุฐูุฑ", "๐ค":"ู
ุงูู ุบูุงุก", "๐พ":"ูุฑู", "๐":"ุฏุฌุงุฌุฉ",
"๐":"ุณุคุงู", "๐ฎ":"ุจุญุฑ", "๐":"ุฏูุงุก", "๐๐ผ":"ุดูุฑ", "๐๐ฟ ":"ุญุงุฑุณ", "๐ฌ":"ุณููู
ุง", "โฆ๏ธ":"ู
ุฑุญ", "๐ก":"ููุฑุฉ", "โผ":"ุชุนุฌุจ", "๐ผ":"ุทูู", "๐":"ู
ูุชุงุญ",
"โฅ๏ธ":"ุญุจ", "๐":"ูุนุจุฉ", "๐":"ุฏุฌุงุฌุฉ", "๐ฉ":"ู
ุนุชุฑุถ", "๐ฝ":"ูุถุงุฆู", "โ๏ธ":"ู
ุทุฑ", "๐ท":"ุนุตูุฑ", "๐":"ูุฌู
ุฉ", "โ๏ธ":"ุณุญุจ", "๐":"ู
ุนุชุฑุถ", "๐บ":"ู
ุฑุญ",
"๐ช":"ุณูููุฉ", "โจ":"ุณุฎูููุฉ", "๐๐ผ":"ุถุฑุจ", "โ":"ููู
", "๐ถ๐พโโ๏ธ":"ูุณูุฑ", "๐":"ุถุฑุจ", "๐":"ุญุจ", "๐ธ":"ู
ุฑุญ", "๐๐ป":"ูุง ูุนุฌุจูู", "๐๐ฝ":"ุถุฑุจุฉ", "๐":"ุญุจ",
"๐ฅ":"ุชุตููุฑ", "๐":"ุฌุฐุจ ุงูุชุจุงู", "๐๐ฝ":"ูุตูู", "๐ช๐ป":"ุนุถูุงุช", "๐ด":"ุงุณูุฏ", "๐ฅ":"ุญุฑูู", "๐ฌ":"ุฎูู", "๐๐ฟ":"ูุถุฑุจ", "๐ฟ":"ูุฑูู ุดุฌุฑู", "โ๐ผ":"ูู ุงูุฏ",
"๐":"ุงูุฏู ู
ูุชูุญู", "โ ๏ธ":"ุฑุนุจ", "๐":"ูููุฆ", "๐" :"ุตุงู
ุช", "๐ฟ":"ุญุฒู", "โน๏ธ":"ุญุฒู", "๐" :"ุญุจ", "๐ฐ" :"ุฎูู ู ุญุฒู", "๐ผ":"ูุฑุฏู", "๐":"ุจูุณู",
"๐":"ูุงุณูู", "โฃ๏ธ":"ุญุจ", "๐ง":"ุณู
ุงุนุงุช", "๐":"ููุชุจ", "๐":"ุณุนูุฏ", "๐":"ุฑุนุจ", "๐":"ูุฌุฑู", "โ๐ป":"ุนูุงู
ู ุงููุตุฑ", "๐ซ":"ูุถุฑุจ", "โ๏ธ":"ุชุนุฌุจ",
"๐":"ุบูุฑ ู
ูุงูู", "๐":"ููู", "๐":"ูููู
ูู", "โข":"ุฑู
ุฒ", "๐ถ๐ฝ":"ูุชู
ุดู", "๐ฏ":"ู
ุชูุงุฌุฃ", "โ":"ูุฏ ู
ุบููู", "๐ป":"ุงุนุฌุงุจ", "๐" :"ูุฑุฏ", "๐ง":"ุทููู",
"๐ด":"ุฏุงุฆุฑู ุญู
ุฑุงุก", "๐ช๐ฝ":"ููู", "๐ค":"ููู
", "๐":"ุญูุฑู", "โ๐ป":"ููุชุจ", "โ๏ธ":"ุชูุฌ", "๐":"ุฑุนุจ", "๐ค":"ุบุถุจ", "๐":"ููู
", "๐ฉ":"ูุงุจ", "โ๏ธ":"ูููู",
"๐น":"ุถุญู", "๐":"ุญุจ", "โ๏ธ ":"ูุงุฑ", "๐ป":"ุฑุนุจ", "๐คฎ":"ู
ูุฑู", "๐คข":"ู
ูุฑู", "๐คช":"ู
ุฑุญ", "๐ฅด":"ุชุนุจ", "๐คง":"ู
ุฑุถ", "๐ค":"ู
ุฑุถ", "๐ค":"ู
ุฑุถ", "๐ค":"ู
ุฑุญ",
"๐ค":"ุตู
ุช", "๐คซ":"ุตู
ุช", "๐คญ":"ุถุญู", "๐ง":"ุชูููุฑ", "๐ค":"ุฐูุงุก", "๐คฉ":"ู
ุฑุญ", "๐ฅณ":"ู
ุฑุญ", "๐ฅบ":"ุญุฒู", "๐คฅ":"ูุฐุจ", "๐ค":"ุชูููุฑ", "๐ค":"ู
ุฑุญ", "๐ฅฐ":"ู
ุฑุญ",
"๐ค":"ุญุจ", "๐คฒ":"ุฏุนุงุก", "๓พด":"", "๐คฆ":"ุงูู
", "๐คทโ":"ููู", "๐ค":"ูุฏ", "๐ฆ":"ู
ุฑุญ", "๐ฅ":"ููุฒ",
}
self.emoticons = {
":))": "ุถุญู", "((:": "ุถุญู", ":)": "ุถุญู", "(:": "ุถุญู",
":(": "ุญุฒู", "):": "ุญุฒู", "xD": "ุถุญู", "XD": "ุถุญู",
":=(": "ูุจูู", ":'(": "ุญุฒู", ":'โ(": "ุญุฒู", "XD" : "ุถุญู",
":D" : "ุถุญู", "โฌ" : "ู
ูุณููู", "โก" : "ุญุจ", "โป" : "ุถุญู",
}
self.negation_words = [
"ูุณุช","ู
ุจ","ุบูุฑ", "ููุณ", "ุณูู", "ูู
", "ูู", "ู
ุง", "ูุง", "ุจูุง", "ุจุฏูู", "ุนูุฑ", "ุนุฏุง", "ููุง"
]
self.stemmer = ISRIStemmer()
def preprocess_text(self, text):
    """Run every cleaning step on *text*, in a fixed order, and return the result.

    Order: emoji replacement, emoticon replacement, stop-word removal, URL
    removal, Latin-letter removal, digit removal, normalization, punctuation
    removal, stemming. Each step receives the previous step's output.
    """
    pipeline = (
        self.replace_emojis_with_text,
        self.replace_emoticons_with_text,
        self.remove_stop_words,
        self.remove_urls,
        self.remove_non_arabic,
        self.remove_numbers,
        self.normalize_arabic,
        self.remove_punctuations,
        self.lemmatize_arabic,
    )
    for step in pipeline:
        text = step(text)
    return text
def remove_stop_words(self, text):
    """Drop Arabic stop words from *text*, keeping the negation words.

    The text is split on whitespace; tokens found in NLTK's Arabic stop-word
    list are removed unless they also appear in ``self.negation_words``. The
    remaining tokens are re-joined with single spaces.
    """
    # Set difference instead of a filtered list: membership tests below are
    # O(1) per token rather than a scan of the whole stop-word list.
    stopwords_to_remove = set(stopwords.words('arabic')).difference(self.negation_words)
    return " ".join(word for word in text.split() if word not in stopwords_to_remove)
def lemmatize_arabic(self, text):
    """Apply ``self.stemmer.stem`` to each whitespace-separated token.

    Despite the name, the work is done by the stemmer stored on the
    instance; the processed tokens are re-joined with single spaces.
    """
    stem = self.stemmer.stem
    stemmed_tokens = map(stem, text.split())
    return " ".join(stemmed_tokens)
def normalize_arabic(self, text):
    """Normalize letter variants, repetitions, diacritics and digits in *text*.

    Steps, in order: trim surrounding whitespace; apply the single-letter
    substitutions; collapse the listed doubled/tripled letter sequences;
    squeeze whitespace runs to one space; cap any run of a repeated character
    at two occurrences; strip tashkeel and diacritics via pyarabic; drop
    every character for which ``str.isdigit`` is true.
    """
    text = text.strip()
    # NOTE(review): the non-ASCII literals in this method look mis-encoded in
    # the copy under review; they are kept as found. Confirm against the
    # original file before relying on them.
    text = re.sub("ู", "ู", text)
    text = re.sub("ุค", "ุก", text)
    text = re.sub("ุฆ", "ุก", text)
    text = re.sub("ุฉ", "ู", text)
    # Remove repetitions
    text = re.sub("[ุฅุฃูฑุขุง]", "ุง", text)
    text = text.replace('ูู', 'ู')
    text = text.replace('ูู', 'ู')
    text = text.replace('ููู', 'ู')
    text = text.replace('ุงุง', 'ุง')
    # Remove extra whitespace (raw string: '\s' is not a valid str escape)
    text = re.sub(r'\s+', ' ', text)
    # Remove elongation: any run of the same character is cut down to two
    text = re.sub(r'(.)\1+', r"\1\1", text)
    # Strip vowel marks (tashkeel), including shadda
    text = araby.strip_tashkeel(text)
    # Strip remaining diacritics (harakat and small letters, per pyarabic)
    text = araby.strip_diacritics(text)
    text = ''.join(ch for ch in text if not ch.isdigit())
    return text
def remove_non_arabic(self, text):
    """Replace each run of ASCII Latin letters in *text* with one space.

    Only the characters A-Z and a-z are affected; everything else (digits,
    punctuation, other scripts) is left in place.
    """
    latin_run = re.compile('[A-Za-z]+')
    return latin_run.sub(' ', text)
def remove_numbers(self, text):
    """Return *text* without any character for which ``str.isdigit`` is true."""
    kept = []
    for ch in text:
        if ch.isdigit():
            continue
        kept.append(ch)
    return ''.join(kept)
def remove_punctuations(self, text):
    """Strip punctuation, digits and ASCII Latin letters, then tidy whitespace.

    Every character in the punctuation/digit set below, and every letter in
    A-Z / a-z, is replaced by a space; the result is then re-joined so words
    are separated by single spaces with no leading or trailing whitespace.
    """
    # NOTE(review): the non-ASCII characters in these literals look
    # mis-encoded in the copy under review; they are kept as found. Confirm
    # against the original file.
    punctuation_chars = r"""!"#$%&'()*+,ุ-./:;<=>ุ?@[\]^_`{|}~"0123456789โขโ"""
    # The letter ranges are appended outside re.escape(): escaping them turns
    # "A-Za-z" into the literal characters A, -, Z, a, z instead of ranges.
    strip_pattern = '[%sA-Za-z]' % re.escape(punctuation_chars)
    text = re.sub(strip_pattern, ' ', text)
    text = text.replace('ุ', "")
    # split() with no argument already collapses whitespace runs and drops
    # leading/trailing whitespace, so no separate regex pass or strip is needed.
    return " ".join(text.split())
def remove_urls(self, text):
    """Delete URLs from *text*.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character; it is replaced by nothing,
    so surrounding spaces are left as they were.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
def replace_emojis_with_text(self, text):
    """Replace emojis in *text* with their mapped words from ``self.emojis``.

    Keys made of several code points (for example an emoji followed by a
    modifier or variation selector) are matched first, longest first, so they
    are replaced as a unit instead of being looked up one code point at a
    time, which could never match them. Remaining characters are handled
    individually: a character the ``emoji`` package recognises is replaced by
    its mapped word (or kept as-is when unmapped) and padded with spaces;
    any other character is passed through ``self.emojis.get`` unpadded.
    """
    longest_key = max(map(len, self.emojis), default=1)
    pieces = []
    position = 0
    length = len(text)
    while position < length:
        # Try multi-code-point keys at this position, longest candidate first.
        for width in range(min(longest_key, length - position), 1, -1):
            chunk = text[position:position + width]
            if chunk in self.emojis:
                pieces.append(" " + self.emojis[chunk] + " ")
                position += width
                break
        else:
            char = text[position]
            replacement = self.emojis.get(char, char)
            if emoji.distinct_emoji_list(char):
                # Pad so the replacement word does not fuse with its neighbours.
                replacement = " " + replacement + " "
            pieces.append(replacement)
            position += 1
    # Join once at the end instead of growing a string with += in the loop.
    return "".join(pieces)
def replace_emoticons_with_text(self, text):
    """Swap space-separated emoticon tokens for their ``self.emoticons`` words.

    The text is split on single space characters only; each piece that is a
    key of ``self.emoticons`` is replaced, other pieces are kept. Every piece
    is followed by one space, so the result always ends with a space.
    """
    lookup = self.emoticons.get
    return "".join(lookup(token, token) + " " for token in text.split(' '))