
Commit 9d848b7
Merge pull request #93 from paoloRais/master
recall@k
2 parents: 64e355d + b428b56

File tree: 2 files changed, 138 insertions(+), 0 deletions(-)

lightfm/evaluation.py (61 additions, 0 deletions)
@@ -10,6 +10,7 @@
 __all__ = ['precision_at_k',
+           'recall_at_k',
            'auc_score',
            'reciprocal_rank']

@@ -72,6 +73,66 @@ def precision_at_k(model, test_interactions, train_interactions=None,
     return precision


+def recall_at_k(model, test_interactions, train_interactions=None,
+                k=10, user_features=None, item_features=None,
+                preserve_rows=False, num_threads=1):
+    """
+    Measure the recall at k metric for a model: the number of positive items in the first k
+    positions of the ranked list of results divided by the number of positive items
+    in the test period. A perfect score is 1.0.
+
+    Parameters
+    ----------
+
+    model: LightFM instance
+        the model to be evaluated
+    test_interactions: np.float32 csr_matrix of shape [n_users, n_items]
+        Non-zero entries representing known positives in the evaluation set.
+    train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional
+        Non-zero entries representing known positives in the train set. These
+        will be omitted from the score calculations to avoid re-recommending
+        known positives.
+    k: integer, optional
+        The k parameter.
+    user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional
+        Each row contains that user's weights over features.
+    item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional
+        Each row contains that item's weights over features.
+    preserve_rows: boolean, optional
+        When False (default), the number of rows in the output will be equal to
+        the number of users with interactions in the evaluation set. When True,
+        the number of rows in the output will be equal to the number of users.
+    num_threads: int, optional
+        Number of parallel computation threads to use. Should
+        not be higher than the number of physical cores.
+
+    Returns
+    -------
+
+    np.array of shape [n_users with interactions or n_users,]
+        Numpy array containing recall@k scores for each user. If there are no interactions
+        for a given user having items in the test period, the returned recall will be 0.
+    """
+
+    ranks = model.predict_rank(test_interactions,
+                               train_interactions=train_interactions,
+                               user_features=user_features,
+                               item_features=item_features,
+                               num_threads=num_threads)
+
+    ranks.data[ranks.data < k] = 1.0
+    ranks.data[ranks.data >= k] = 0.0
+
+    retrieved = np.squeeze(test_interactions.getnnz(axis=1))
+    hit = np.squeeze(np.array(ranks.sum(axis=1)))
+
+    if not preserve_rows:
+        hit = hit[test_interactions.getnnz(axis=1) > 0]
+        retrieved = retrieved[test_interactions.getnnz(axis=1) > 0]
+
+    return hit / retrieved
+
+
 def auc_score(model, test_interactions, train_interactions=None,
               user_features=None, item_features=None,
               preserve_rows=False, num_threads=1):
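As a side note (not part of the commit), a minimal usage sketch of the new metric: it assumes a LightFM model fitted on a binary CSR interaction matrix and follows the same call pattern as the existing precision_at_k. The random train/test split below is purely illustrative.

import numpy as np
import scipy.sparse as sp

from lightfm import LightFM
from lightfm.evaluation import recall_at_k

# Illustrative implicit-feedback data: 50 users, 200 items, ~10% positives,
# with roughly 20% of the positives held out as a disjoint test set.
rng = np.random.RandomState(42)
dense = (rng.random_sample((50, 200)) < 0.1).astype(np.float32)
holdout = rng.random_sample(dense.shape) < 0.2
train = sp.csr_matrix(dense * ~holdout)
test = sp.csr_matrix(dense * holdout)

model = LightFM(loss='bpr')
model.fit(train, epochs=5)

# Per-user recall@10, excluding items already seen during training.
recall = recall_at_k(model, test, train_interactions=train, k=10)
print(recall.mean())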

tests/test_evaluation.py (77 additions, 0 deletions)
@@ -66,6 +66,45 @@ def _precision_at_k(model, ground_truth, k, train=None, user_features=None, item
     return sum(precisions) / len(precisions)


+def _recall_at_k(model, ground_truth, k, train=None, user_features=None,
+                 item_features=None):
+    # Alternative test implementation
+
+    ground_truth = ground_truth.tocsr()
+
+    no_users, no_items = ground_truth.shape
+
+    pid_array = np.arange(no_items, dtype=np.int32)
+
+    recalls = []
+
+    uid_array = np.empty(no_items, dtype=np.int32)
+
+    if train is not None:
+        train = train.tocsr()
+
+    for user_id, row in enumerate(ground_truth):
+        uid_array.fill(user_id)
+
+        predictions = model.predict(uid_array, pid_array,
+                                    user_features=user_features,
+                                    item_features=item_features,
+                                    num_threads=4)
+        if train is not None:
+            train_items = train[user_id].indices
+            top_k = set([x for x in np.argsort(-predictions)
+                         if x not in train_items][:k])
+        else:
+            top_k = set(np.argsort(-predictions)[:k])
+
+        true_pids = set(row.indices[row.data == 1])
+
+        if true_pids:
+            recalls.append(len(top_k & true_pids) / float(len(true_pids)))
+
+    return sum(recalls) / len(recalls)
+
+
 def _auc(model, ground_truth, train=None, user_features=None, item_features=None):

     ground_truth = ground_truth.tocsr()
@@ -143,6 +182,44 @@ def test_precision_at_k():
     assert np.allclose(precision.mean(), expected_mean_precision)


+def test_recall_at_k():
+
+    no_users, no_items = (10, 100)
+
+    train, test = _generate_data(no_users, no_items)
+
+    model = LightFM(loss='bpr')
+    model.fit_partial(train)
+
+    k = 10
+
+    # Without omitting train interactions
+    recall = evaluation.recall_at_k(model,
+                                    test,
+                                    k=k)
+    expected_mean_recall = _recall_at_k(model,
+                                        test,
+                                        k)
+
+    assert np.allclose(recall.mean(), expected_mean_recall)
+    assert len(recall) == (test.getnnz(axis=1) > 0).sum()
+    assert len(evaluation.recall_at_k(model,
+                                      train,
+                                      preserve_rows=True)) == test.shape[0]
+
+    # With omitting train interactions
+    recall = evaluation.recall_at_k(model,
+                                    test,
+                                    k=k,
+                                    train_interactions=train)
+    expected_mean_recall = _recall_at_k(model,
+                                        test,
+                                        k,
+                                        train=train)
+
+    assert np.allclose(recall.mean(), expected_mean_recall)
+
+
 def test_auc_score():

     no_users, no_items = (10, 100)
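Also not part of the commit: the thresholding idea in recall_at_k can be checked by hand on a toy sparse rank matrix. The matrix below is an assumed stand-in for what model.predict_rank would return (0-based ranks of each user's test positives); ranks below k count as hits, and dividing per-user hit counts by per-user positive counts gives recall@k.

import numpy as np
import scipy.sparse as sp

k = 2

# Assumed toy ranks of each user's known test positives (0-based, lower is better).
# User 0 has positives ranked 1 and 5; user 1 has positives ranked 1, 3 and 7.
ranks = sp.csr_matrix(np.array([[1., 5., 0., 0.],
                                [0., 1., 3., 7.]], dtype=np.float32))

# Positives ranked inside the top k count as hits.
hits = ranks.copy()
hits.data = np.where(hits.data < k, 1.0, 0.0)

retrieved = ranks.getnnz(axis=1)                  # test positives per user: [2, 3]
hit = np.squeeze(np.array(hits.sum(axis=1)))      # hits per user: [1, 1]

print(hit / retrieved)                            # [0.5, 0.333...]: recall@2 per user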
