-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathtag_clustering.py
78 lines (59 loc) · 1.9 KB
/
tag_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
# encoding: utf-8
import sys, os
import csv
import time
import numpy
from Pycluster import *
# Module-level state shared by the functions below.

# First CSV column -> list of tags from the second column; filled by load_data().
data = {}
# Tags gathered from every entry of `data` by process_urls_tags().
all_tags = []
# Keys of `data` in the order process_urls_tags() visited them; print_data()
# pairs them with cluster labels by position.
all_urls = []
# 0/1 tag-indicator tuples built by create_vectors(), one per entry of `data`.
numerical_data = []
# Input CSV. NOTE(review): hard-coded absolute path, specific to one machine.
filename = "/Users/antigen/dev/ml_class/intro_web_data/links.csv"
# Cluster label -> list of URLs assigned to it; filled by print_data().
clustered_urls = {}
# Cluster label -> list of tags of the URLs assigned to it; filled by print_data().
clustered_tags = {}
# Index of the first label print_data() reads from the label sequence.
i = 0
def load_data(filename):
    """Read the URL/tag CSV into the module-level ``data`` dict.

    For every non-blank row, the first column becomes the key and the second
    column is split on commas to form the list of tags.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The module-level ``data`` dict, updated in place.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    # The context manager closes the file as soon as reading finishes (or
    # fails) instead of leaving the handle for the garbage collector.
    with open(filename, 'r') as fh:
        for row in csv.reader(fh):
            if not row:
                # csv.reader yields [] for a blank line; there is no URL to index.
                continue
            data[row[0]] = row[1].split(',')
    return data
def process_urls_tags(data, all_urls, all_tags):
    """Collect every URL and the distinct tags found in ``data``.

    Args:
        data: Mapping of URL -> list of tags.
        all_urls: List that receives each URL, in ``data`` iteration order.
        all_tags: List that receives the tags. On return it holds every
            distinct tag exactly once, in sorted order.

    Returns:
        ``all_tags`` itself, after de-duplication.
    """
    for url, tags in data.items():
        all_urls.append(url)
        all_tags.extend(tags)
    # Slice-assign rather than rebind: rebinding the local name would leave
    # the caller's list full of duplicates. Sorting makes the tag order (and
    # therefore any vector column order derived from it) deterministic.
    all_tags[:] = sorted(set(all_tags))
    return all_tags
def create_vectors(data, all_tags):
    """Build one 0/1 tag-indicator row per entry of ``data``.

    Column ``j`` of a row is 1 when ``all_tags[j]`` is among that entry's
    tags and 0 otherwise. Rows follow ``data`` iteration order.

    Args:
        data: Mapping of URL -> list of tags.
        all_tags: Sequence of tags defining the columns.

    Returns:
        A numpy array built from the rows.

    Side effects:
        The module-level ``numerical_data`` list is replaced, in place, with
        the rows of this call.
    """
    rows = []
    for tags in data.values():
        # Build the set once per URL so each column test is a hash lookup.
        present = set(tags)
        rows.append(tuple(1 if tag in present else 0 for tag in all_tags))
    # Overwrite instead of append: appending made a second call return the
    # previous call's rows stacked on top of the new ones.
    numerical_data[:] = rows
    return numpy.array(rows)
def cluster_items(vectors, nclusters=30, dist='a', npass=10):
    """Cluster the rows of ``vectors`` with Pycluster's ``kcluster``.

    Args:
        vectors: Data matrix handed to ``kcluster``, one row per item.
        nclusters: Number of clusters requested (default 30).
        dist: Pycluster distance code (default 'a'). Other options noted
            here previously: 'e' for Euclidean and 'b' for city-block.
            NOTE(review): 'a' is presumably absolute correlation; confirm
            against the Pycluster manual.
        npass: Value forwarded as ``kcluster``'s ``npass`` (default 10).

    Returns:
        The first element of ``kcluster``'s result: the cluster label of
        each row.
    """
    # kcluster also returns an error value and a count; neither is used here.
    labels, _error, _nfound = kcluster(
        vectors, nclusters=nclusters, dist=dist, npass=npass)
    # Single-argument call form prints identically on Python 2 and Python 3.
    print("just before the return")
    return labels
def print_data(labels, all_urls, clustered_urls, i):
    """Group URLs (and their tags) by cluster label and print the URL groups.

    Args:
        labels: Indexable sequence of cluster labels; the label for the
            k-th URL is ``labels[i + k]``.
        all_urls: URLs in the same order as the labels.
        clustered_urls: Dict updated in place, label -> list of URLs.
        i: Index of the first label to read.

    Returns:
        None. Each label and its list of URLs is written to standard output.

    Side effects:
        The module-level ``clustered_tags`` dict is updated in place with
        label -> list of tags, looked up in the module-level ``data``. It is
        collected but not printed.
    """
    for position, url in enumerate(all_urls, i):
        label = labels[position]
        clustered_urls.setdefault(label, []).append(url)
        clustered_tags.setdefault(label, []).extend(data[url])
    # Single-argument call form prints identically on Python 2 and Python 3.
    for cluster_id, urls in clustered_urls.items():
        print(cluster_id)
        print(urls)
# Pipeline: load the CSV, gather URLs and tags, vectorise, cluster, print.
data = load_data(filename)
processed = process_urls_tags(data, all_urls, all_tags)
# Build the columns from the de-duplicated list returned above; the list that
# was passed in may still contain repeated tags, which would yield duplicate
# columns and skew the clustering distances.
vectors = create_vectors(data, processed)
labels = cluster_items(vectors)
# print_data() returns None; the name is kept for compatibility.
printed = print_data(labels, all_urls, clustered_urls, i)