ibbyzj
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎DictCollabListening/anydbmTEST.py
+23 b/‎DictCollabListening/anydbmTEST.py
+23
diff --git a/‎DictCollabListening/millionUserShelf.py
+29-49 b/‎DictCollabListening/millionUserShelf.py
+29-49
diff --git a/‎DictCollabListening/songShelf.db
24 KB b/‎DictCollabListening/songShelf.db
24 KB
diff --git a/‎DictCollabListening/songs.db
3.88 MB b/‎DictCollabListening/songs.db
3.88 MB
diff --git a/‎DictCollabListening/songsTest.db
24 KB b/‎DictCollabListening/songsTest.db
24 KB
diff --git a/‎Kunal/fillLines500Matts.py
+52 b/‎Kunal/fillLines500Matts.py
+52
diff --git a/‎Kunal/validate_submission.py
+198 b/‎Kunal/validate_submission.py
+198
diff --git a/‎graphSolution/buildGraph.py
+27 b/‎graphSolution/buildGraph.py
+27
@@ -0,0 +1 @@
+*.txt
@@ -0,0 +1,23 @@
+# Builds a song co-occurrence table in an anydbm file: db[song] holds str() of a dict which reads {otherSong:timesPlayed, otherSong2:times2Played ... }
+import fileinput
+import anydbm
+db = anydbm.open('songsTest.db', 'c')
+user_id = 1
+for line in fileinput.input(["userHistoriesTESTFILE.txt"]):
+	# Each input line is one user's history: whitespace-separated song ids.
+	history = line.rstrip('\n').split()
+	x = len(history)
+	for i in range(x):
+		# Key the record on the song id, not on its position in this history, so counts accumulate per song across users.
+		song = history[i]
+		songDict = {}
+		if db.has_key(song):
+			# NOTE(review): eval() executes whatever text is stored in the db file -- safe only while this script is the sole writer.
+			songDict = eval(db[song])
+		for j in range(x):
+			if i != j:
+				songDict[history[j]] = songDict.get(history[j], 0) + 1
+		db[song] = str(songDict)
+	print(str(len(db.keys())))
+	user_id+=1
+db.close()
@@ -1,62 +1,42 @@
 #should create file with song number as line number; each line contains print of dictionary which should read {otherSong:timesPlayed, otherSong2:times2Played ... }
 import sys
 import fileinput
+import anydbm
 from collections import defaultdict
-newfile = open("userReccomendations.txt","w")
+db = anydbm.open('songs.db','c')
 user_id=1
-songsDict = defaultdict(lambda: defaultdict(int))
-with open("userHistoriesTrainingSet.txt") as myfile:
-	for line in myfile:
-		history = line.rstrip('\n').split()
-		x = len(history)
-		for i in range(x):
-			for j in range(x):
-				if i <> j:
-					songsDict[history[i]][history[j]] += 1
-		print(str(user_id*100/1000000)+"%...training..dict...");
-		user_id+=1;
-user_id = 1
-for line in fileinput.input(['userHistoriesFixed.txt']):
-	temp = line.rstrip('\n')
-	history = temp.split()
+for line in fileinput.input(["userHistoriesTrainingSet.txt"]):
+	history = line.rstrip('\n').split()
 	x = len(history)
 	for i in range(x):
+		songDict = {}
+		if db.has_key(history[i]):
+			songDict = eval(db[history[i]])
 		for j in range(x):
 			if i <> j:
-				songsDict[history[i]][history[j]] += 1
-	print(str(user_id*100/110000)+"%...test..dict...");
+				# Count song j as co-played with song i, keyed by song id rather than by list position.
+				songDict[history[j]] = songDict.get(history[j], 0) + 1
+		# The record is stored as the dict's str() and read back with eval() on the next visit.
+		# NOTE(review): eval() runs whatever text is in songs.db -- safe only while this script is the sole writer.
+		db[history[i]]=str(songDict)
+	print(str(user_id)+"users...training..dict..." + str(len(db))+"songs...training..dict...");
 	user_id+=1;
-userNum = 1
-for line in fileinput.input(['userHistoriesFixed.txt']):
-	userDict = defaultdict(int)
+user_id = 1
+for line in fileinput.input(["userHistoriesFixed.txt"]):
 	history = line.rstrip('\n').split()
 	x = len(history)
 	for i in range(x):
-		for key in songsDict[history[i]]:
-			if not(key in history):
-				userDict[key] += songsDict[history[i]][key]
-	sortedDict = [x for x in userDict.iteritems()]
-	sortedDict.sort(key=lambda x: x[1], reverse=True)
-	printString = ''
-	if len(sortedDict)<500:
-		for i in range(len(sortedDict)):
-			printString += (str(sortedDict[i][0]) + ' ')
-		emptySpace = 500 - len(sortedDict)
-		isFull = False
-		f=open('popularSongs.txt',r)
-		for line in f.readlines():
-			if emptySpace <= 0:
-				isFull = True
-			if not isFull:
-				nextSong = line.rstrip('\n')
-				if not(nextSong in userDict.iterkeys):
-					printString += (str(nextSong) + ' ')
-					emptySpace -= 1
-		f.close()
-	else:
-		for i in range(500):
-			printString += (str(sortedDict[i][0]) + ' ')
-	newfile.write(printString + '\n')
-	userNum +=1
-	print(str(userNum*100/110000)+"%...recs...");
-newfile.close()
+		songDict = {}
+		if db.has_key(history[i]):
+			songDict = eval(db[history[i]])
+		for j in range(x):
+			if i != j:
+				# Same per-song co-occurrence update as the training pass above.
+				songDict[history[j]] = songDict.get(history[j], 0) + 1
+		# Keyed by song id so this pass adds to the records the training pass created.
+		# NOTE(review): eval() on stored text again -- see the caveat in the training pass.
+		db[history[i]]=str(songDict)
+	print(str(user_id*100/110000)+"%...test..dict...");
+	user_id+=1;
+user_id = 1
+db.close()
@@ -0,0 +1,52 @@
+# Pads each line of KunalsReccomendations.txt to exactly 500 songs and writes the result to
+# KunalsReccomendationsFilledByMatts.txt. Missing slots are filled, without repeating a song
+# already on the line, from three sources in order: the same-numbered line of
+# userFullRecs2.txt, then popularSongs.txt (one song per line), then the integers 1, 2, 3, ...
+import sys
+import linecache
+import fileinput
+
+TARGET = 500  # songs required on every output line
+
+
+def fillFrom(songList, seen, candidates):
+	"""Append candidates not yet in `seen` to songList, in order, until it holds TARGET songs.
+
+	Stops quietly when `candidates` runs out, so the caller can fall through to the next source.
+	"""
+	for testSong in candidates:
+		if len(songList) >= TARGET:
+			return
+		if testSong not in seen:
+			seen.add(testSong)
+			songList.append(testSong)
+
+
+popularSongs = None  # read from popularSongs.txt the first time a line needs it
+fullLinesFile = open('KunalsReccomendationsFilledByMatts.txt','w')
+try:
+	lineNum = 1
+	for line in fileinput.input(['KunalsReccomendations.txt']):
+		# split() rather than split(' ') so a blank line yields no songs instead of one empty id
+		songList = line.split()
+		seen = set(songList)  # set membership replaces the per-candidate linear scan
+		if len(songList) < TARGET:
+			fillFrom(songList, seen, linecache.getline('userFullRecs2.txt', lineNum).split())
+		if len(songList) < TARGET:
+			if popularSongs is None:
+				with open('popularSongs.txt','r') as filler:
+					popularSongs = [s.strip() for s in filler if s.strip()]
+			fillFrom(songList, seen, popularSongs)
+		incSong = 1
+		while len(songList) < TARGET:
+			# Compare as a string: every other id on the line is text read from a file.
+			testSong = str(incSong)
+			if testSong not in seen:
+				seen.add(testSong)
+				songList.append(testSong)
+			incSong += 1
+		# Lines that arrived with more than TARGET songs are cut down to the first TARGET.
+		fullLinesFile.write(' '.join(songList[:TARGET]) + '\n')
+		lineNum += 1
+finally:
+	fullLinesFile.close()
@@ -0,0 +1,198 @@
+#!/usr/bin/env python
+"""
+Thierry Bertin-Mahieux (2012) Columbia University
+[email protected]
+
+Code to validate a submission file for the Million Song Dataset
+Challenge on Kaggle. Requires an internet connection.
+This code is developed under python 2.7 (Ubuntu machine).
+
+Copyright 2012, Thierry Bertin-Mahieux
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+__author__ = 'Thierry Bertin-Mahieux <[email protected]>'
+__date__ = 'Sun Mar 11 18:39:03 EDT 2012'
+
+
+import os
+import sys
+import time
+import urllib2
+
+
+# Number of predicted songs required per user.
+CUTOFF = 500
+
+# Million Song Dataset website file directory.
+HTML_PREFIX = 'http://labrosa.ee.columbia.edu/millionsong/sites/default/files/'
+
+# Canonical list of users for the contest, there should be predictions for
+# each user, one user per line, users are in the same order as in this file.
+CANONICAL_USER_LIST = '%s%s' % (HTML_PREFIX,
+                                'challenge/canonical/kaggle_users.txt')
+
+# Canonical list of songs and their integer index.
+CANONICAL_SONG_LIST = '%s%s' % (HTML_PREFIX,
+                                'challenge/canonical/kaggle_songs.txt')
+
+
+def load_list_from_the_web(url):
+    """Grab a text file, return each line in a list.
+
+    Args:
+        url: address of the file, opened with urllib2.urlopen.
+
+    Returns:
+        A list with one string per line, each stripped of surrounding
+        whitespace.
+
+    Progress and the elapsed time are printed to stdout.
+    """
+    print '---retrieveing url %s...' % url
+    t1 = time.time()
+    stream = urllib2.urlopen(url)
+    try:
+        data = [l.strip() for l in stream.readlines()]
+    finally:
+        # Release the connection even if the download fails midway.
+        stream.close()
+    print '    DONE! It took %d seconds.' % int(time.time() - t1)
+    return data
+
+
+def print_error_message(msg, line_num=None):
+    """Print `msg` to stdout behind an 'ERROR! ' tag.
+
+    When `line_num` is truthy the tag is followed by '[line N] ', so the
+    message can be traced back to a line of the submission file.
+    """
+    location = '[line %d] ' % line_num if line_num else ''
+    print 'ERROR! %s%s' % (location, msg)
+
+
+def validate_one_line(line, line_num, min_max_song_indexes):
+    """Make sure an individual line looks valid, return True if so.
+
+    Args:
+        line: text of the line, surrounding whitespace already removed.
+        line_num: line number, used only in the error messages.
+        min_max_song_indexes: (min_index, max_index) pair of valid song
+            indexes; min_index must be 1.
+
+    Returns:
+        True if the line holds exactly CUTOFF single-space-separated,
+        distinct integers between 1 and max_index, False otherwise.
+        An error message is printed for each problem found.
+    """
+    is_valid = True
+    min_index, max_index = min_max_song_indexes
+    assert min_index == 1, 'Problem, minimum song index is not 1.'
+    # Line too small or empty?
+    if len(line) < CUTOFF:
+        print_error_message("Line too short! (%d characters)" % len(line),
+                            line_num)
+        is_valid = False
+    parts = line.split(' ')
+    # Not the right number of items per line?
+    if len(parts) != CUTOFF:
+        msg = "Line should have %d one-space-separated elements, " % CUTOFF
+        msg += "found %d" % len(parts)
+        print_error_message(msg, line_num)
+        is_valid = False
+    # Parsed indexes, kept for the duplicate check below.
+    indexes = []
+    for song_index in parts:
+        # Is the song an integer?
+        try:
+            index = int(song_index)
+        except ValueError:
+            if len(song_index) == 18 and song_index[:2] == 'SO':
+                msg = 'Predicted songs should be integers, not SO...'
+                msg += 'Found: %s' % song_index
+                print_error_message(msg, line_num)
+            else:
+                msg = 'Found non-integer song ID: %s' % song_index
+                print_error_message(msg, line_num)
+            is_valid = False
+            break
+        # Is it 0-indexed instead of 1?
+        if index == 0:
+            msg = 'Found song index 0, song indexes start at 1.'
+            print_error_message(msg, line_num)
+            is_valid = False
+            break
+        # Is the index a valid integer?
+        elif index < 1 or index > max_index:
+            msg = 'Found song index %d, ' % index
+            msg += 'it should be between 1 and %d.' % max_index
+            print_error_message(msg, line_num)
+            is_valid = False
+            break
+        indexes.append(index)
+    # Are there song duplicates?
+    if is_valid:
+        # Every element takes part, including the first one on the line,
+        # and the comparison is on integers so '7' and '07' are the same.
+        if len(set(indexes)) != len(indexes):
+            msg = 'There is at least one song ID duplicate.'
+            print_error_message(msg, line_num)
+            is_valid = False
+    # Done.
+    return is_valid
+
+
+def main(argv):
+    """Validate the submission from canonical files fetched online.
+
+    Args:
+        argv: command-line arguments; argv[1] is the submission file path.
+
+    Exits with status 0 when every line is valid and the file has one line
+    per canonical user. Exits with status 1 at the first invalid line or
+    when the number of lines is wrong. A missing file prints the usage text
+    through die_with_usage().
+    """
+
+    # Sanity check on the file.
+    submission_filename = argv[1]
+    if not os.path.isfile(submission_filename):
+        print 'ERROR: file %s does not exist.' % submission_filename
+        die_with_usage()
+
+    # Fetch data files.
+    users = load_list_from_the_web(CANONICAL_USER_LIST)
+    songs_and_indexes = load_list_from_the_web(CANONICAL_SONG_LIST)
+
+    # Check user file.
+    assert len(users) == 110000, 'Problem with the online user file.'
+    for user in users:
+        assert len(user) == 40, '%s' % (
+            'Problem with the online user file (user: %s).' % user, )
+
+    print '***************************************'
+    print '**********ANALYZING SUBMISSION*********'
+
+    # Extract indexes from the list of songs.
+    indexes = [int(line.split(' ')[1]) for line in songs_and_indexes]
+    min_index = min(indexes)
+    max_index = max(indexes)
+    msg_song_file_prob = 'Problem with the online song file, aborting.'
+    assert min_index == 1, msg_song_file_prob
+    assert max_index == len(indexes), msg_song_file_prob
+    min_max_index = (min_index, max_index)
+
+    # Go through each line and validate it; stop at the first invalid one.
+    line_number = 0
+    with open(submission_filename, 'r') as fIn:
+        for line in fIn:
+            line_number += 1
+            if not validate_one_line(line.strip(), line_number,
+                                     min_max_index):
+                # Non-zero status so that a calling script can tell the
+                # submission was rejected.
+                sys.exit(1)
+
+    # There must be exactly one line of predictions per canonical user.
+    if line_number != len(users):
+        msg = 'Found %d lines, ' % line_number
+        msg += 'there should be one per user (%d).' % len(users)
+        print_error_message(msg)
+        sys.exit(1)
+
+    # Final message.
+    print '***************************************'
+    print 'Awesome, your submission is good to go!'
+    sys.exit(0)
+
+
+
+
+def die_with_usage():
+    """Print the command-line help text to stdout, then exit with status 0."""
+    usage_lines = (
+        'MSD CHallenge: script to validate your submission to Kaggle.',
+        '(you need an internet connection)',
+        '------------------------------------------------------------',
+        '',
+        'python validate_submission.py <submission file>',
+        '',
+        'ARGS',
+        '   <submission file>   File to be uploaded to Kaggle.',
+    )
+    print '\n'.join(usage_lines)
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+
+    # Show the help text and quit when no file is given or help is asked for.
+    HELP_KEYWORDS = ('help', '-help', '--help')
+    wants_help = len(sys.argv) < 2 or sys.argv[1].lower() in HELP_KEYWORDS
+    if wants_help:
+        die_with_usage()
+
+    main(sys.argv)
@@ -0,0 +1,27 @@
+import networkx as nx
+import itertools
+import fileinput
+
+G = nx.Graph()
+
+def addNode(node1, node2, iWeight=1, iG=G):
+    """Add edge (node1, node2) to iG with weight iWeight (default 1); if the edge already exists, add iWeight to its weight.
+    Uses graph G if iG is left unspecified."""
+    if iG.has_edge(node1, node2):
+        iG[node1][node2]['weight'] += iWeight
+    else:
+        iG.add_edge(node1, node2, weight=iWeight)
+
+def edgesFromNodes(nodesList):
+    """Add an edge of weight 1 through addNode for every pair of nodes in nodesList (for list [a b c] adds (a,b) (a,c) and (b,c))."""
+    # combinations() yields each unordered pair of list positions exactly once.
+    for node1, node2 in itertools.combinations(nodesList, 2):
+        addNode(node1, node2)
+
+# Feed every history line of userHistoriesFixed.txt into graph G, printing
+# "<lines read>...<nodes>...<edges>" after each line as a progress report,
+# then save the graph as a weighted edge list in testGraph.txt.
+for counter, line in enumerate(fileinput.input(["userHistoriesFixed.txt"]), 1):
+    edgesFromNodes(line.rstrip('\n').split())
+    print("%d...%d...%d" % (counter, G.number_of_nodes(), G.number_of_edges()))
+nx.write_weighted_edgelist(G, 'testGraph.txt')