add more data to repo

kaiwudufe · Jul 29, 2012 · 71c4997 · 71c4997
1 parent 063039f
commit 71c4997
Show file tree

Hide file tree

Showing 8 changed files with 1,010,591 additions and 0 deletions.
diff --git a/ch07/README b/ch07/README
@@ -0,0 +1,159 @@
+SUMMARY
+================================================================================
+
+These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
+made by 6,040 MovieLens users who joined MovieLens in 2000.
+
+USAGE LICENSE
+================================================================================
+
+Neither the University of Minnesota nor any of the researchers
+involved can guarantee the correctness of the data, its suitability
+for any particular purpose, or the validity of results based on the
+use of the data set.  The data set may be used for any research
+purposes under the following conditions:
+
+     * The user may not state or imply any endorsement from the
+       University of Minnesota or the GroupLens Research Group.
+
+     * The user must acknowledge the use of the data set in
+       publications resulting from the use of the data set, and must
+       send us an electronic or paper copy of those publications.
+
+     * The user may not redistribute the data without separate
+       permission.
+
+     * The user may not use this information for any commercial or
+       revenue-bearing purposes without first obtaining permission
+       from a faculty member of the GroupLens Research Project at the
+       University of Minnesota.
+
+If you have any further questions or comments, please contact Sean McNee
+<[email protected]>. 
+
+ACKNOWLEDGEMENTS
+================================================================================
+
+Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
+set.
+
+FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
+================================================================================
+
+The GroupLens Research Project is a research group in the Department of 
+Computer Science and Engineering at the University of Minnesota. Members of 
+the GroupLens Research Project are involved in many research projects related 
+to the fields of information filtering, collaborative filtering, and 
+recommender systems. The project is led by professors John Riedl and Joseph 
+Konstan. The project began to explore automated collaborative filtering in 
+1992, but is best known for its worldwide trial of an automated 
+collaborative filtering system for Usenet news in 1996. Since then the project 
+has expanded its scope to research overall information filtering solutions, 
+integrating content-based methods as well as improving current collaborative 
+filtering technology.
+
+Further information on the GroupLens Research project, including research 
+publications, can be found at the following web site:
+
+        http://www.grouplens.org/
+
+GroupLens Research currently operates a movie recommender based on 
+collaborative filtering:
+
+        http://www.movielens.org/
+
+RATINGS FILE DESCRIPTION
+================================================================================
+
+All ratings are contained in the file "ratings.dat" and are in the
+following format:
+
+UserID::MovieID::Rating::Timestamp
+
+- UserIDs range between 1 and 6040 
+- MovieIDs range between 1 and 3952
+- Ratings are made on a 5-star scale (whole-star ratings only)
+- Timestamp is represented in seconds since the epoch as returned by time(2)
+- Each user has at least 20 ratings
+
+USERS FILE DESCRIPTION
+================================================================================
+
+User information is in the file "users.dat" and is in the following
+format:
+
+UserID::Gender::Age::Occupation::Zip-code
+
+All demographic information is provided voluntarily by the users and is
+not checked for accuracy.  Only users who have provided some demographic
+information are included in this data set.
+
+- Gender is denoted by an "M" for male and "F" for female
+- Age is chosen from the following ranges:
+
+	*  1:  "Under 18"
+	* 18:  "18-24"
+	* 25:  "25-34"
+	* 35:  "35-44"
+	* 45:  "45-49"
+	* 50:  "50-55"
+	* 56:  "56+"
+
+- Occupation is chosen from the following choices:
+
+	*  0:  "other" or not specified
+	*  1:  "academic/educator"
+	*  2:  "artist"
+	*  3:  "clerical/admin"
+	*  4:  "college/grad student"
+	*  5:  "customer service"
+	*  6:  "doctor/health care"
+	*  7:  "executive/managerial"
+	*  8:  "farmer"
+	*  9:  "homemaker"
+	* 10:  "K-12 student"
+	* 11:  "lawyer"
+	* 12:  "programmer"
+	* 13:  "retired"
+	* 14:  "sales/marketing"
+	* 15:  "scientist"
+	* 16:  "self-employed"
+	* 17:  "technician/engineer"
+	* 18:  "tradesman/craftsman"
+	* 19:  "unemployed"
+	* 20:  "writer"
+
+MOVIES FILE DESCRIPTION
+================================================================================
+
+Movie information is in the file "movies.dat" and is in the following
+format:
+
+MovieID::Title::Genres
+
+- Titles are identical to titles provided by the IMDB (including
+year of release)
+- Genres are pipe-separated and are selected from the following genres:
+
+	* Action
+	* Adventure
+	* Animation
+	* Children's
+	* Comedy
+	* Crime
+	* Documentary
+	* Drama
+	* Fantasy
+	* Film-Noir
+	* Horror
+	* Musical
+	* Mystery
+	* Romance
+	* Sci-Fi
+	* Thriller
+	* War
+	* Western
+
+- Some MovieIDs do not correspond to a movie due to accidental duplicate
+entries and/or test entries
+- Movies are mostly entered by hand, so errors and inconsistencies may exist
diff --git a/ch07/analysis.py b/ch07/analysis.py
@@ -0,0 +1,80 @@
+from pandas import *
+from pandas.util.decorators import cache_readonly
+import numpy as np
+import os
+
+# Module-level dataset directory name. Nothing in the visible code reads it:
+# Movielens takes its own `base` argument, which has the same default value.
+base = 'ml-100k'
+
+class IndexedFrame(object):
+    """
+    Unfinished wrapper that holds a frame.
+
+    The constructor stores ``frame`` on the instance and nothing else;
+    ``field`` is accepted but not used, and ``_build_index`` is an empty stub.
+    """
+
+    def __init__(self, frame, field):
+        # NOTE(review): `field` is dropped here -- presumably the column the
+        # index should be built on; confirm intent before relying on it.
+        self.frame = frame
+
+    def _build_index(self):
+        # Stub: no index is built yet.
+        pass
+
+class Movielens(object):
+    """
+    Loader for the ``u.data``, ``u.user``, ``u.item`` and ``u.genre`` files
+    found under the directory ``base``.
+
+    Each table is exposed as an attribute decorated with ``cache_readonly``
+    (presumably computed on first access and then cached -- confirm against
+    the pandas version in use).
+
+    NOTE(review): the README added in the same commit describes
+    ``ratings.dat`` / ``users.dat`` / ``movies.dat`` with ``::`` separators,
+    which does not match the file names and separators read here -- confirm
+    which dataset this class is meant for.
+    """
+
+    def __init__(self, base='ml-100k'):
+        # Directory that the u.* file names are joined onto.
+        self.base = base
+
+    @cache_readonly
+    def data(self):
+        """Ratings table with columns user_id, item_id, rating, timestamp."""
+        names = ['user_id', 'item_id', 'rating', 'timestamp']
+        path = os.path.join(self.base, 'u.data')
+        # No `sep` is passed, so read_table's default separator applies.
+        return read_table(path, header=None, names=names)
+
+    @cache_readonly
+    def users(self):
+        """Users table read from the pipe-separated ``u.user`` file."""
+        names = ['user_id', 'age', 'gender', 'occupation', 'zip']
+        path = os.path.join(self.base, 'u.user')
+        return read_table(path, sep='|', header=None, names=names)
+
+    @cache_readonly
+    def items(self):
+        """
+        Items table read from the pipe-separated ``u.item`` file.
+
+        The first five columns are named item_id, title, release_date,
+        video_date and url; the rest are named after genres (presumably
+        per-genre indicator flags -- verify against the data file).
+        """
+        names = ['item_id', 'title', 'release_date', 'video_date',
+                 'url', 'unknown', 'Action', 'Adventure', 'Animation',
+                 "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama',
+                 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
+                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
+        path = os.path.join(self.base, 'u.item')
+        return read_table(path, sep='|', header=None, names=names)
+
+    @cache_readonly
+    def genres(self):
+        """Series built from the ``name`` and ``id`` columns of ``u.genre``."""
+        names = ['name', 'id']
+        path = os.path.join(self.base, 'u.genre')
+        # The last parsed row is discarded (presumably a trailing blank or
+        # junk line in the file -- TODO confirm).
+        data = read_table(path, sep='|', header=None, names=names)[:-1]
+        # NOTE(review): passing a Series as data together with an explicit
+        # index may realign rather than relabel depending on the pandas
+        # version -- verify the result is name-by-id as intended.
+        return Series(data.name, data.id)
+
+    @cache_readonly
+    def joined(self):
+        """Ratings merged with the users table and then with the items table."""
+        # No join keys are given, so merge chooses them itself (presumably the
+        # shared user_id column first, then the shared item_id column).
+        merged = merge(self.data, self.users)
+        merged = merge(merged, self.items)
+        return merged
+
+    def movie_stats(self, title):
+        """
+        Return the mean rating per gender for one movie.
+
+        Parameters
+        ----------
+        title : value compared for equality against the ``title`` column
+            of ``self.joined``
+
+        Returns
+        -------
+        Mean of ``rating`` for the matching rows, grouped by ``gender``.
+        """
+        data = self.joined[self.joined.title == title]
+
+        return data.groupby('gender').rating.mean()
+
+def biggest_gender_discrep(data):
+    """
+    Rank titles by the gap between the mean rating in the ``M`` and ``F``
+    gender columns.
+
+    Parameters
+    ----------
+    data : frame with ``rating``, ``title`` and ``gender`` columns, where
+        ``gender`` takes the values ``M`` and ``F`` (both are accessed as
+        columns of the pivoted result)
+
+    Returns
+    -------
+    Series of ``mean(M) - mean(F)`` per title, ordered by absolute
+    difference, largest first. Only titles with more than 10 ratings in
+    every gender column are kept.
+
+    NOTE(review): uses the ``rows=`` / ``cols=`` pivot_table keywords and
+    ``.ix``, which belong to older pandas releases -- confirm the target
+    pandas version before running.
+    """
+    # Number of ratings per (title, gender); missing combinations count as 0.
+    nobs = data.pivot_table('rating', rows='title',
+                            cols='gender', aggfunc=len, fill_value=0)
+    # Keep a title only if every gender column has more than 10 ratings.
+    mask = (nobs.values > 10).all(1)
+    titles = nobs.index[mask]
+
+    mean_ratings = data.pivot_table('rating', rows='title',
+                                    cols='gender', aggfunc='mean')
+    mean_ratings = mean_ratings.ix[titles]
+
+    diff = mean_ratings.M - mean_ratings.F
+    # argsort is ascending, so reversing it puts the largest |diff| first.
+    return diff[np.abs(diff).argsort()[::-1]]
+
+# Bucket edges; not used anywhere in the visible code (presumably age bins for
+# a later cut/groupby -- TODO confirm).
+buckets = [0, 18, 25, 35, 50, 80]
+
+# Module-level setup: a loader using the default base directory and a sample
+# title. Constructing Movielens only stores the directory name; no file is
+# opened in __init__.
+ml = Movielens()
+title = 'Cable Guy, The (1996)'
diff --git a/ch07/foods-2011-10-03.json b/ch07/foods-2011-10-03.json