# Source: NetflixRecommenderSystem/RecommenderSystem.py (master branch) · fukit0/NetflixRecommenderSystem · GitHub
# (1)  when the argument is a numpy array, np.sum ultimately calls add.reduce to do the work.
#      The overhead of handling its argument and dispatching to add.reduce is why np.sum is slower.

import time

import numpy as np
import pandas
import scipy.stats.stats as pearsonr
import buildUserWeightMatrix # C'ye compile ettiğimiz dosyayı import ediyoruz

knn = 50  # number of most-similar users consulted for each prediction
startTime = time.time()

columnNames = ['movieID', 'userID', 'rating']

# The builtin str/float are used because the np.str / np.float aliases were
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
trainData = pandas.read_csv('TrainingRatings.txt', names=columnNames,
                            dtype={'movieID': str, 'userID': str, 'rating': float})

# first column holds the movie ids, the second one the user ids
listOfMovieIDs = trainData.movieID.tolist()
listOfUserIDs = trainData.userID.tolist()

# unique movie IDs, kept in order of first appearance (Series.unique does not sort)
uniqueListOfMovieIds = trainData.movieID.unique()
numberOfMovies = uniqueListOfMovieIds.size
tempMovieEnum = dict(enumerate(uniqueListOfMovieIds))  # matrix row index -> movie ID
movieEnum = {movieID: index for index, movieID in tempMovieEnum.items()}  # movie ID -> matrix row index

# unique user IDs, kept in order of first appearance
uniqueListOfUserIDs = trainData.userID.unique()
numberOfUsers = uniqueListOfUserIDs.size
tempUserEnum = dict(enumerate(uniqueListOfUserIDs))  # matrix column index -> user ID
userEnum = {userID: index for index, userID in tempUserEnum.items()}  # user ID -> matrix column index

# movie X user matrix
# Zero-initialised so that every (movie, user) pair without a training rating
# reads as 0 ("unrated") instead of whatever np.empty left in memory.
movieUserRatingMatrix = np.zeros(shape=(numberOfMovies, numberOfUsers), dtype=float)

# user X user weight matrix (r)
# NOTE(review): not referenced again in the visible script; the weights that
# are actually used come from buildUserWeightMatrix further down.
userWeightMatrix = np.empty(shape=(numberOfUsers, numberOfUsers), dtype=float)

############################################## FUNCTIONS DEFINITIONS #############################################

def buildMovieUserRatingMatrix( data ):
	"""Copy every rating in ``data`` into the module-level movieUserRatingMatrix.

	``data`` must provide 'movieID', 'userID' and 'rating' columns. Each ID is
	translated to its matrix position through movieEnum / userEnum; rows are
	applied in order, so a later duplicate (movie, user) pair overwrites an
	earlier one. An ID missing from the lookup tables raises KeyError.
	"""
	triples = zip(data['movieID'], data['userID'], data['rating'])
	for movieKey, userKey, score in triples:
		movieUserRatingMatrix[movieEnum[movieKey]][userEnum[userKey]] = score

	return None

# returns root mean square error value
def rmse(predictions, targets):
	"""Return the root of the mean squared element-wise difference.

	``predictions`` and ``targets`` are expected to be arrays that support
	subtraction, multiplication and ``.mean()``.
	"""
	residuals = predictions - targets
	meanSquaredError = (residuals * residuals).mean()
	return np.sqrt(meanSquaredError)


def getMeanRating(ratings):
	"""Return the mean of the positive entries of ``ratings``.

	Entries that are not greater than 0 are treated as "no rating" and are
	left out of the average.

	The previous version looped over ``range(len(ratings))`` and averaged the
	loop index instead of the rating stored at that index, so the result
	depended only on the length of the input.

	Args:
		ratings: one-dimensional array-like of numeric ratings.

	Returns:
		The mean of the entries greater than 0, or 0.0 when there are none
		(so callers never receive NaN from an empty average).
	"""
	ratingArr = np.asarray(ratings, dtype=float)
	rated = ratingArr[ratingArr > 0]

	if rated.size == 0:
		return 0.0

	return rated.mean()

# Predict rating for user <userIndex>, for movie <movieIndex>
def predictAndCompareUserRating(testData, weightMatrix, knn, maxUserIndex=100):
	"""Predict the ratings in ``testData`` with user-based kNN and report errors.

	For every test row whose user index is below ``maxUserIndex`` the rating is
	predicted as the active user's mean rating plus the weighted, mean-centred
	ratings of the ``knn`` most similar users that have a positive entry for
	that movie in movieUserRatingMatrix. Predictions are written to
	'PredictRatings.txt' ("movieID,userID,rating" per line), pairs whose
	prediction exceeds 4 are written to 'RecommendMovie.txt', and the MAE and
	RMSE over all predictions are printed.

	Reads the module-level movieEnum, userEnum and movieUserRatingMatrix and
	calls getMeanRating and rmse.

	Args:
		testData: table with 'movieID', 'userID' and 'rating' columns.
		weightMatrix: user x user similarity weights; ``weightMatrix[i]`` must
			be the row of weights for the user with index ``i``.
		knn: maximum number of similar users used for one prediction.
		maxUserIndex: only users whose index is below this value are
			predicted; ``None`` predicts for every user. The default of 100
			keeps the limit that used to be hard-coded.

	Raises:
		KeyError: a movie or user ID of ``testData`` is missing from
			movieEnum / userEnum.
	"""
	# Plain lists replace the fixed-size '|S10' record arrays: those truncated
	# IDs to 10 bytes, were written as b'...' text on Python 3, left unfilled
	# zero rows in the output files, and the recommendation buffer (sized by
	# the number of movies) could overflow because it is filled per test row.
	predictedMovies = []
	recommendations = []
	maeValue = 0.0

	rmsePredictions = []
	rmseExpecteds = []

	meanRatingCache = {}

	def meanRatingOf(index):
		# getMeanRating scans a whole matrix column, so do it once per user
		if index not in meanRatingCache:
			meanRatingCache[index] = getMeanRating(movieUserRatingMatrix[:, index])
		return meanRatingCache[index]

	# one row of the test file at a time
	for movieID, userID, expectedRating in zip(testData['movieID'], testData['userID'], testData['rating']):
		movieIndex = movieEnum[movieID]
		userIndex = userEnum[userID]

		if maxUserIndex is not None and userIndex >= maxUserIndex:
			continue

		meanRatingOfActiveUser = meanRatingOf(userIndex)
		movieRatings = movieUserRatingMatrix[movieIndex]
		userWeights = np.asarray(weightMatrix[userIndex])

		# Only users that actually rated this movie may vote; an unrated entry
		# must not be counted as a 0-star rating. The active user is left out
		# of their own neighbourhood.
		candidates = np.flatnonzero(movieRatings > 0)
		candidates = candidates[candidates != userIndex]

		# sort the candidates by weight, largest first, and keep at most knn of
		# them (slicing copes with fewer than knn candidates)
		sortedMostSimilarUsers = candidates[np.argsort(userWeights[candidates])[::-1][:knn]]

		numerator = 0.0
		denumerator = 0.0

		for similarUserIndex in sortedMostSimilarUsers:
			weight = userWeights[similarUserIndex]
			# mean of the ratings given by the similar user
			meanRatingOfSimilarUser = meanRatingOf(similarUserIndex)
			# rating the similar user gave to this movie
			similarUserRatingForMovie = movieRatings[similarUserIndex]

			numerator += (similarUserRatingForMovie - meanRatingOfSimilarUser) * weight
			# normalise by absolute weights so negative similarities cannot
			# cancel the denominator or flip the sign of the correction
			denumerator += abs(weight)

		if denumerator > 0:
			predictedRating = meanRatingOfActiveUser + numerator / denumerator
		else:
			# no usable neighbour: fall back to the user's own mean
			predictedRating = meanRatingOfActiveUser

		predictedMovies.append((str(movieID), str(userID), predictedRating))
		maeValue += abs(predictedRating - expectedRating)  # absolute value of the difference
		# keep the values for the rmse computation
		rmsePredictions.append(predictedRating)
		rmseExpecteds.append(expectedRating)

		# movie recommendation
		if predictedRating > 4:
			recommendations.append((str(movieID), str(userID)))

	with open('PredictRatings.txt', 'w') as predictionFile:
		predictionFile.writelines('%s,%s,%f\n' % row for row in predictedMovies)
	with open('RecommendMovie.txt', 'w') as recommendationFile:
		recommendationFile.writelines('%s,%s\n' % row for row in recommendations)

	predictionCounter = len(predictedMovies)
	if predictionCounter:
		maeResult = maeValue / predictionCounter
		rmseResult = rmse(np.asarray(rmsePredictions), np.asarray(rmseExpecteds))
	else:
		# nothing was predicted: report NaN instead of dividing by zero
		maeResult = float('nan')
		rmseResult = float('nan')

	print("MAE  : " + str(maeResult))
	print("RMSE : " + str(rmseResult))
	return

############################################ END OF FUNCTIONS DEFINITIONS ########################################

# Store the rating each user gave to each movie in the movie x user matrix.
buildMovieUserRatingMatrix(trainData)
print("Movie-user matrix build time is %s" % (time.time() - startTime))

# Build the weight matrix between users.
# That routine lives in a module compiled to C, with the aim of speeding the program up.
weightMatrix = buildUserWeightMatrix.buildWeightMatrixBetweenUsers(movieUserRatingMatrix,numberOfUsers)
#print(weightMatrix[0:10, 0:10])
print("Execution time: %s seconds." % (time.time() - startTime))


# Read the test ratings. The builtin str/float are used because the
# np.str / np.float aliases were removed in NumPy 1.24.
testData = pandas.read_csv('TestingRatings.txt', names=columnNames, dtype={'movieID': str, 'userID': str, 'rating': float})
#print(len(testData))
predictAndCompareUserRating(testData,weightMatrix,knn)
# program execution time takes about 55 hours