Skip to content

Commit 6218890

Browse files
author
Matt C
committed
Haven't committed in a while, so here we go
1 parent 3873681 commit 6218890

10 files changed

+353
-49
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.txt

DictCollabListening/anydbmTEST.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Builds a dbm file keyed by song ID. Each value is the printed form of a
# dictionary {otherSong: timesPlayedTogether, otherSong2: ...} counting how
# often the two songs appear in the same user history.
import ast
import fileinput
import anydbm

db = anydbm.open('songsTest.db', 'c')
try:
    user_id = 1
    for line in fileinput.input(["userHistoriesTESTFILE.txt"]):
        history = line.rstrip('\n').split()
        for i, song in enumerate(history):
            # Key on the song itself, not on its position in the history;
            # positional keys only reflect how long each history is.
            try:
                # literal_eval parses the stored dict without executing
                # arbitrary code, unlike eval().
                songDict = ast.literal_eval(db[song])
            except KeyError:
                songDict = {}
            for j, otherSong in enumerate(history):
                if i != j:
                    songDict[otherSong] = songDict.get(otherSong, 0) + 1
            db[song] = str(songDict)
        print(str(len(db.keys())))
        user_id += 1
    user_id = 1
finally:
    # The original referenced db.close without calling it, so the database
    # was never explicitly flushed and closed.
    db.close()
+29-49
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,42 @@
11
#should create file with song number as line number; each line contains print of dictionary which should read {otherSong:timesPlayed, otherSong2:times2Played ... }
22
import sys
33
import fileinput
4+
import anydbm
45
from collections import defaultdict
5-
newfile = open("userReccomendations.txt","w")
6+
db = anydbm.open('songs.db','c')
67
user_id=1
7-
songsDict = defaultdict(lambda: defaultdict(int))
8-
with open("userHistoriesTrainingSet.txt") as myfile:
9-
for line in myfile:
10-
history = line.rstrip('\n').split()
11-
x = len(history)
12-
for i in range(x):
13-
for j in range(x):
14-
if i <> j:
15-
songsDict[history[i]][history[j]] += 1
16-
print(str(user_id*100/1000000)+"%...training..dict...");
17-
user_id+=1;
18-
user_id = 1
19-
for line in fileinput.input(['userHistoriesFixed.txt']):
20-
temp = line.rstrip('\n')
21-
history = temp.split()
8+
for line in fileinput.input(["userHistoriesTrainingSet.txt"]):
9+
history = line.rstrip('\n').split()
2210
x = len(history)
2311
for i in range(x):
12+
songDict = {}
13+
if db.has_key(str(i)):
14+
songDict = eval(db[str(i)])
2415
for j in range(x):
2516
if i <> j:
26-
songsDict[history[i]][history[j]] += 1
27-
print(str(user_id*100/110000)+"%...test..dict...");
17+
if songDict.has_key(j):
18+
songDict[j] +=1
19+
else:
20+
songDict[j] = 1
21+
db[str(i)]=str(songDict)
22+
print(str(user_id)+"users...training..dict..." + str(len(db))+"songs...training..dict...");
2823
user_id+=1;
29-
userNum = 1
30-
for line in fileinput.input(['userHistoriesFixed.txt']):
31-
userDict = defaultdict(int)
24+
user_id = 1
25+
for line in fileinput.input(["userHistoriesFixed.txt"]):
3226
history = line.rstrip('\n').split()
3327
x = len(history)
3428
for i in range(x):
35-
for key in songsDict[history[i]]:
36-
if not(key in history):
37-
userDict[key] += songsDict[history[i]][key]
38-
sortedDict = [x for x in userDict.iteritems()]
39-
sortedDict.sort(key=lambda x: x[1], reverse=True)
40-
printString = ''
41-
if len(sortedDict)<500:
42-
for i in range(len(sortedDict)):
43-
printString += (str(sortedDict[i][0]) + ' ')
44-
emptySpace = 500 - len(sortedDict)
45-
isFull = False
46-
f=open('popularSongs.txt',r)
47-
for line in f.readlines():
48-
if emptySpace <= 0:
49-
isFull = True
50-
if not isFull:
51-
nextSong = line.rstrip('\n')
52-
if not(nextSong in userDict.iterkeys):
53-
printString += (str(nextSong) + ' ')
54-
emptySpace -= 1
55-
f.close()
56-
else:
57-
for i in range(500):
58-
printString += (str(sortedDict[i][0]) + ' ')
59-
newfile.write(printString + '\n')
60-
userNum +=1
61-
print(str(userNum*100/110000)+"%...recs...");
62-
newfile.close()
29+
songDict = {}
30+
if db.has_key(str(i)):
31+
songDict = eval(db[str(i)])
32+
for j in range(x):
33+
if i <> j:
34+
if songDict.has_key(j):
35+
songDict[j] +=1
36+
else:
37+
songDict[j] = 1
38+
db[str(i)]=str(songDict)
39+
print(str(user_id*100/1000000)+"%...training..dict...");
40+
user_id+=1;
41+
user_id = 1
42+
db.close

DictCollabListening/songShelf.db

24 KB
Binary file not shown.

DictCollabListening/songs.db

3.88 MB
Binary file not shown.

DictCollabListening/songsTest.db

24 KB
Binary file not shown.

Kunal/fillLines500Matts.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import sys
import linecache
import fileinput
import itertools

# Number of songs every output line must contain.
CUTOFF = 500


def fill_songs(songs, fallbacks=(), cutoff=CUTOFF):
    """Return at most `cutoff` distinct song IDs, taking `songs` first.

    songs     -- iterable of song IDs already recommended for the user.
    fallbacks -- sequence of iterables consulted in order once `songs` is
                 used up; they are consumed lazily, so the last one may be
                 infinite (e.g. itertools.count(1)).
    cutoff    -- number of IDs wanted.

    IDs are compared as strings, so the integer 7 and the string '7' count
    as the same song. Empty IDs are skipped. The result is shorter than
    `cutoff` only when every source runs dry.
    """
    filled = []
    seen = set()
    for candidate in itertools.chain(songs, *fallbacks):
        if len(filled) >= cutoff:
            break
        candidate = str(candidate)
        if candidate and candidate not in seen:
            seen.add(candidate)
            filled.append(candidate)
    return filled


def _read_songs(path):
    """Yield one song ID per line of `path`; the file is opened on first use."""
    with open(path, 'r') as handle:
        for row in handle:
            yield row.rstrip('\n')


def main():
    """Pad every line of KunalsReccomendations.txt to CUTOFF songs.

    Missing songs come, in order, from the same line of userFullRecs2.txt,
    then from popularSongs.txt, then from the integers 1, 2, 3, ...
    """
    with open('KunalsReccomendationsFilledByMatts.txt', 'w') as fullLinesFile:
        recommendations = fileinput.input(['KunalsReccomendations.txt'])
        for lineNum, line in enumerate(recommendations, 1):
            backup = linecache.getline('userFullRecs2.txt', lineNum).split()
            fallbacks = (backup,
                         _read_songs('popularSongs.txt'),
                         itertools.count(1))
            songList = fill_songs(line.split(), fallbacks)
            fullLinesFile.write(' '.join(songList) + '\n')


if __name__ == '__main__':
    main()

Kunal/validate_submission.py

+198
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
#!/usr/bin/env python
2+
"""
3+
Thierry Bertin-Mahieux (2012) Columbia University
4+
5+
6+
Code to validate a submission file for the Million Song Dataset
7+
Challenge on Kaggle. Requires an internet connection.
8+
This code is developed under python 2.7 (Ubuntu machine).
9+
10+
Copyright 2012, Thierry Bertin-Mahieux
11+
12+
This program is free software: you can redistribute it and/or modify
13+
it under the terms of the GNU General Public License as published by
14+
the Free Software Foundation, either version 3 of the License, or
15+
(at your option) any later version.
16+
17+
This program is distributed in the hope that it will be useful,
18+
but WITHOUT ANY WARRANTY; without even the implied warranty of
19+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20+
GNU General Public License for more details.
21+
22+
You should have received a copy of the GNU General Public License
23+
along with this program. If not, see <http://www.gnu.org/licenses/>.
24+
"""
25+
__author__ = 'Thierry Bertin-Mahieux <[email protected]>'
26+
__date__ = 'Sun Mar 11 18:39:03 EDT 2012'
27+
28+
29+
import os
30+
import sys
31+
import time
32+
import urllib2
33+
34+
35+
# Number of predicted songs required per user.
36+
CUTOFF = 500
37+
38+
# Million Song Dataset website file directory.
39+
HTML_PREFIX = 'http://labrosa.ee.columbia.edu/millionsong/sites/default/files/'
40+
41+
# Canonical list of users for the contest, there should be predictions for
42+
# each user, one user per line, users are in the same order as in this file.
43+
CANONICAL_USER_LIST = '%s%s' % (HTML_PREFIX,
44+
'challenge/canonical/kaggle_users.txt')
45+
46+
# Canonical list of songs and their integer index.
47+
CANONICAL_SONG_LIST = '%s%s' % (HTML_PREFIX,
48+
'challenge/canonical/kaggle_songs.txt')
49+
50+
51+
def load_list_from_the_web(url):
    """Grab a text file, return each line in a list.

    url -- address handed to urllib2.urlopen.

    Returns the lines of the response with surrounding whitespace stripped.
    Also prints the URL being fetched and the elapsed time in seconds.
    """
    print('---retrieveing url %s...' % url)
    t1 = time.time()
    stream = urllib2.urlopen(url)
    try:
        data = [raw_line.strip() for raw_line in stream.readlines()]
    finally:
        # Release the connection even if reading the response fails.
        stream.close()
    print(' DONE! It took %d seconds.' % int(time.time() - t1))
    return data
60+
61+
62+
def print_error_message(msg, line_num=None):
    """Print a formatted error message.

    msg      -- text of the error.
    line_num -- optional line number; when given (including 0) it is shown
                as '[line N] ' between the 'ERROR! ' prefix and the message.
    """
    prefix = 'ERROR! '
    # Compare against None explicitly so that line number 0 is still shown.
    if line_num is not None:
        prefix += '[line %d] ' % line_num
    print('%s%s' % (prefix, msg))
68+
69+
70+
def validate_one_line(line, line_num, min_max_song_indexes):
    """Make sure an individual line looks valid, return True if so.

    line                 -- one submission line (the caller passes it
                            already stripped).
    line_num             -- line number used in error messages.
    min_max_song_indexes -- (min_index, max_index) pair of legal song
                            indexes; min_index must be 1.

    Every problem found is reported through print_error_message. Scanning
    of the individual song IDs stops at the first bad one.
    """
    min_index, max_index = min_max_song_indexes
    assert min_index == 1, 'Problem, minimum song index is not 1.'
    is_valid = True
    # Line too small or empty? CUTOFF IDs need at least CUTOFF characters.
    if len(line) < CUTOFF:
        print_error_message("Line too short! (%d characters)" % len(line),
                            line_num)
        is_valid = False
    parts = line.split(' ')
    # Not the right number of items per line?
    if len(parts) != CUTOFF:
        msg = "Line should have %d one-space-separated elements, " % CUTOFF
        msg += "found %d" % len(parts)
        print_error_message(msg, line_num)
        is_valid = False
    indexes = []
    for song_index in parts:
        # Is the song an integer?
        try:
            index = int(song_index)
        except ValueError:
            if len(song_index) == 18 and song_index[:2] == 'SO':
                msg = 'Predicted songs should be integers, not SO...'
                msg += 'Found: %s' % song_index
            else:
                msg = 'Found non-integer song ID: %s' % song_index
            print_error_message(msg, line_num)
            is_valid = False
            break
        # Is it 0-indexed instead of 1?
        if index == 0:
            msg = 'Found song index 0, song indexes start at 1.'
            print_error_message(msg, line_num)
            is_valid = False
            break
        # Is the index a valid integer?
        elif index < 1 or index > max_index:
            msg = 'Found song index %d, ' % index
            msg += 'it should be between 1 and %d.' % max_index
            print_error_message(msg, line_num)
            is_valid = False
            break
        indexes.append(index)
    # Are there song duplicates? Every element is checked, including the
    # first one, which the previous parts[1:] comparison skipped.
    if is_valid and len(set(indexes)) != len(indexes):
        msg = 'There is at least one song ID duplicate.'
        print_error_message(msg, line_num)
        is_valid = False
    # Done.
    return is_valid
122+
123+
124+
def main(argv):
    """Validate the submission from canonical files fetched online.

    argv -- command-line arguments; argv[1] is the submission file path.

    Downloads the canonical user and song lists, sanity-checks them with
    assertions, then validates the submission line by line and stops at
    the first invalid line. Always ends by calling sys.exit.
    """
    # Sanity check on the file.
    submission_filename = argv[1]
    if not os.path.isfile(submission_filename):
        print('ERROR: file %s does not exist.' % submission_filename)
        die_with_usage()

    # Fetch data files.
    users = load_list_from_the_web(CANONICAL_USER_LIST)
    songs_and_indexes = load_list_from_the_web(CANONICAL_SONG_LIST)

    # Check user file.
    assert len(users) == 110000, 'Problem with the online user file.'
    for user in users:
        assert len(user) == 40, (
            'Problem with the online user file (user: %s).' % user)

    print('***************************************')
    print('**********ANALYZING SUBMISSION*********')

    # Extract indexes from the list of songs.
    indexes = [int(line.split(' ')[1]) for line in songs_and_indexes]
    min_index = min(indexes)
    max_index = max(indexes)
    msg_song_file_prob = 'Problem with the online song file, aborting.'
    assert min_index == 1, msg_song_file_prob
    assert max_index == len(indexes), msg_song_file_prob
    min_max_index = (min_index, max_index)

    # Go through each line and validate it. The with-block closes the file
    # on every exit path, including the sys.exit below.
    with open(submission_filename, 'r') as fIn:
        for line_number, line in enumerate(fIn, 1):
            if not validate_one_line(line.strip(), line_number,
                                     min_max_index):
                # NOTE(review): exit status 0 is kept for compatibility,
                # although the submission is invalid at this point.
                sys.exit(0)

    # Final message: reaching this point means no line was rejected.
    print('***************************************')
    print('Awesome, your submission is good to go!')
    sys.exit(0)
176+
177+
178+
def die_with_usage():
    """Print the help menu, then exit the interpreter with status 0."""
    usage_lines = (
        'MSD CHallenge: script to validate your submission to Kaggle.',
        '(you need an internet connection)',
        '------------------------------------------------------------',
        '',
        'python validate_submission.py <submission file>',
        '',
        'ARGS',
        ' <submission file> File to be uploaded to Kaggle.',
    )
    for usage_line in usage_lines:
        print(usage_line)
    sys.exit(0)
189+
190+
191+
if __name__ == '__main__':
192+
193+
# Display the help menu and quit?
194+
HELP_KEYWORDS = ('help', '-help', '--help')
195+
if len(sys.argv) < 2 or sys.argv[1].lower() in HELP_KEYWORDS:
196+
die_with_usage()
197+
198+
main(sys.argv)

graphSolution/buildGraph.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import networkx as nx
2+
import itertools
3+
import fileinput
4+
5+
G = nx.Graph()
6+
7+
""" Adds an edge w/ weight 1, increments if already exists
8+
Uses graph G if left unspecified"""
9+
def addNode(node1, node2, iWeight=1, iG=G):
    """Add the edge (node1, node2) to iG, or strengthen it if present.

    A new edge is created with weight iWeight; an existing edge has
    iWeight added to its 'weight' attribute. iG defaults to the
    module-level graph G.
    """
    if not iG.has_edge(node1, node2):
        iG.add_edge(node1, node2, weight=iWeight)
        return
    edge_attrs = iG[node1][node2]
    edge_attrs['weight'] = edge_attrs['weight'] + iWeight
14+
15+
""" Addes edges from list of nodes (weight == 1) (for list [a b c] adds (a,b) (a,c) and (b,c) """
16+
def edgesFromNodes(nodesList):
    """Call addNode once for every unordered pair of entries in nodesList.

    For the list [a, b, c] this adds (a, b), (a, c) and (b, c), each with
    addNode's default weight and default graph.
    """
    for first, second in itertools.combinations(nodesList, 2):
        addNode(first, second)
20+
21+
# Feed each line of the histories file into the graph, printing the line
# count, node count and edge count after every line, then write the
# weighted edge list to disk.
counter = 1
for line in fileinput.input(["userHistoriesFixed.txt"]):
    history = line.rstrip('\n').split()
    edgesFromNodes(history)
    progress = (counter, G.number_of_nodes(), G.number_of_edges())
    print("...".join(str(value) for value in progress))
    counter += 1
nx.write_weighted_edgelist(G, 'testGraph.txt')

0 commit comments

Comments
 (0)