-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathgood_vs_evil.py
148 lines (113 loc) · 4.12 KB
/
good_vs_evil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#! /usr/bin/env python
'''
This classifier finds a specific voting ring when presented with data. For example,
4 IP addresses can be voting together as a voting ring. This classifier can detect them
on different articles or clumps.
This can also be used for finding IP address patterns, like a group of students hitting
a specific website.
This can also be used to find specific plagiarised text amongst chunks of text.
'''
import sys, os
import re, string
from random import randrange
from nltk import FreqDist
############################################################
#
# This section takes the incoming data and turns it into
# per-feature and per-category counts
#
############################################################
# feature -> {category -> number of times the feature was trained under
# that category}; filled in by increment_feature().
feature_count = {}
# category -> number of items trained under that category; filled in by
# increment_cat().
category_count = {}
def train_from_data(data):
    """Train the classifier on every document of every category in *data*.

    data: mapping of category label -> whitespace-separated string of
    documents. Each document is handed to train() with its category.
    """
    labelled_docs = (
        (doc, category)
        for category, documents in data.items()
        for doc in documents.split()
    )
    for doc, category in labelled_docs:
        train(doc, category)
def train(item, category):
    """Record one training item under *category*.

    Every feature extracted from *item* has its per-category count
    incremented, then the category's item count is incremented once.
    """
    for feature in get_features(item):
        increment_feature(feature, category)
    # One item was trained, whatever number of features it produced.
    increment_cat(category)
def increment_feature(feature, category):
    """Add one to the number of times *feature* was seen under *category*."""
    # Create the per-feature table on first sight, then bump the counter.
    per_category = feature_count.setdefault(feature, {})
    per_category[category] = per_category.get(category, 0) + 1
def increment_cat(category):
    """Add one to the number of items trained under *category*."""
    category_count[category] = category_count.get(category, 0) + 1
############################################################
#
# calculating scores functions below
#
############################################################
def probability(item, category):
    """
    probability: prob that an item is in a category

    Multiplies the document probability of *item* under *category* by the
    share of trained items that belong to *category*.

    Returns 0.0 when nothing has been trained yet, instead of dividing by
    a zero total.
    """
    total_items = sum(category_count.values())
    if not total_items:
        # No training data at all: there is no evidence for any category.
        return 0.0
    # get_category_count() returns a float, so this is a true division.
    category_prob = get_category_count(category) / total_items
    return document_probability(item, category) * category_prob
def get_category_count(category):
    """Return, as a float, how many items were trained under *category*.

    Categories that were never trained yield 0.0.
    """
    return float(category_count.get(category, 0))
def document_probability(item, category):
    """Return the product of weighted_prob() over the features of *item*.

    Each feature yielded by get_features(item) contributes one factor.
    A trace line "<feature> - <category> - <weight>" is printed per feature.
    Returns 1 when *item* yields no features.
    """
    p = 1
    for feature in get_features(item):
        # Compute the weight once and reuse it for the trace and the product.
        feature_weight = weighted_prob(feature, category)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("%s - %s - %s" % (feature, category, feature_weight))
        p *= feature_weight
    return p
def get_features(document):
    """Split *document* on whitespace and return a FreqDist of its words.

    As a debugging aid, the (word, count) pairs are printed, ordered by
    descending count and then by word.
    """
    all_words_freq = FreqDist(document.split())
    # Index into the pair rather than unpacking it in the lambda signature:
    # tuple parameters are Python-2-only syntax.
    by_count_then_word = sorted(
        all_words_freq.items(), key=lambda pair: (-pair[1], pair[0])
    )
    print(by_count_then_word)
    return all_words_freq
def get_feature_count(feature, category):
    """Return, as a float, how often *feature* was trained under *category*.

    Unknown features, or features never seen under *category*, yield 0.0.
    """
    return float(feature_count.get(feature, {}).get(category, 0))
def feature_prob(f, category):
    """Return the count of feature *f* in *category* divided by the
    number of items trained under *category*.

    Returns 0 when the category has no trained items.
    """
    items_in_category = get_category_count(category)
    if items_in_category == 0:
        return 0
    return get_feature_count(f, category) / items_in_category
def weighted_prob(f, category, weight=1.0, ap=0.5):
    """Blend feature_prob(f, category) with an assumed probability.

    f:        the feature.
    category: the category the feature is scored against.
    weight:   how much the assumed probability counts for.
    ap:       the assumed probability, returned unchanged when the
              feature has never been seen in any category.

    Returns (weight*ap + totals*basic_prob) / (weight + totals), where
    totals is the count of *f* summed over every trained category.
    """
    basic_prob = feature_prob(f, category)
    # Use a distinct loop name so the `category` parameter is not shadowed
    # (a list comprehension would rebind it in Python 2).
    totals = sum(get_feature_count(f, label) for label in category_count)
    return ((weight * ap) + (totals * basic_prob)) / (weight + totals)
# Known attacks / voting rings used as training data: each key is a
# category label and each value is a whitespace-separated string of
# documents trained under that label (passed to train_from_data below).
data = {"1.1.1.1":"2.2.2.2 3.3.3.3 4.4.4.4",
}
# Empty placeholder; not read anywhere in the visible code.
locations = {
}
# sliding window sampling magic
# Empty placeholder; presumably reserved for time-based features - TODO confirm.
times = {
}
# Empty placeholder; not read anywhere in the visible code.
account_names = {}
def generate_ip():
    """Return a random dotted-quad string such as "12.0.255.7".

    Each of the four octets is drawn independently from 0..255 inclusive.
    """
    # randrange() excludes its upper bound, so 256 is needed to reach 255.
    return ".".join(str(randrange(256)) for _ in range(4))
def make_list(number):
    """Return *number* random IP addresses as one space-delimited string.

    The result starts with a space and every address is followed by a
    space, so it can be appended directly to another address string.
    With number <= 0 the result is a single space.
    """
    return " " + "".join(generate_ip() + " " for _ in range(number))
# Demo driver: train on the known ring, then score three sample vote sets.
# Other features (time, location, ...) are not used yet.
voting_ring = "1.1.1.1 2.2.2.2 3.3.3.3 4.4.4.4"
good_ring = make_list(10)
# make_list() output starts with a space, so the addresses stay separated.
normal_ring = "1.1.1.1 2.2.2.2" + make_list(8)

train_from_data(data)

bad_guy_score = probability(voting_ring, "1.1.1.1")
# "10.10.10.10" is not a key of the training data, so it was never trained.
good_guy_score = probability(good_ring, "10.10.10.10")
normal_score = probability(normal_ring, "1.1.1.1")

# Single-argument print(...) produces identical output on Python 2 and 3.
print("Any score greater than zero is a probability of a voting ring")
print("*" * 50)
print("I have identified voting ring %s" % bad_guy_score)
print("I know these votes are not a voting ring %s" % good_guy_score)
print("I believe these votes look normal %s" % normal_score)