# Source code for supervised_dl_class_v1

# -*- coding: utf-8 -*-
"""
supervised_dl_class.py
~~~~~~~~~~~~~~~~~~~~~~
.. topic:: Contents

    The supervised_dl_class module includes the SupervisedDL class
    and its fit and score functions.

    Created on Wed Jun 29 16:37:28 2016

    @authors: bisot, serizel

    .. [#] V.Bisot, R. Serizel, S. Essid, and G. Richard.
        "Feature Learning with Matrix Factorization Applied to
        Acoustic Scene Classification".
        Accepted for publication in *IEEE Transactions on Audio,
        Speech and Language Processing*, 2017

    .. [#] R. Serizel, V.Bisot, S. Essid, and G. Richard.
        “Supervised group nonnegative matrix factorisation with similarity
        constraints and applications to speaker identification”.
        In Proc. of *2017 IEEE International Conference on Acoustics,
        Speech and Signal Processing (ICASSP)*, 2017.
"""

import numpy as np
import spams
from sklearn import decomposition
import beta_ntf
from sklearn import preprocessing
from sklearn.metrics import accuracy_score,  f1_score
from sklearn.linear_model import LogisticRegression
import random
import time
import copy

from sklearn.utils.extmath import (
    logsumexp, log_logistic, safe_sparse_dot,
    softmax, squared_norm)
from sklearn.utils.validation import (
    DataConversionWarning,
    check_X_y, NotFittedError)
from sklearn.preprocessing import LabelEncoder, LabelBinarizer


class SupervisedDL(object):
    """ Supervised DL class

    Task-driven Dictionary Learning with modified algorithm

    Parameters
    ----------
    data : array, shape (n_samples, n_features)
        Training data matrix
        Needs to be provided if initialization is done in the model

    n_components : int
        Size of the dictionary

    n_labels : int
        number of classes

    pos : bool, default: True
        When set to True, the model is fit in its nonnegative formulation

    n_iter : int
        Number of epochs on which to run the algorithm

    lambda1 : float, default: 0
        Parameter controlling the l1 norm penalty for the projection step

    lambda2 : float, default: 0
        Parameter controlling the l2 norm penalty for the projection step

    rho : float, default: 0.001
        Initial gradient step parameter

    mu : float, default: 1
        Regularization strength of the classifier; must be positive.
        Smaller values specify stronger regularization.

    agreg :  int, optional (default: 1)
        Used when classification is done on bags of agreg successive
        projections. Every successive group of agreg projections
        (without overlap) is averaged before classification

    init :  str, {'random', 'NMF', 'DictionnaryLearning'}
        Controls the nature of the dictionary initialization.

        * Use 'random' for a random initialization of the dictionary
        * Use 'NMF' to initialize with nonnegative matrix factorization
        * Use 'DictionnaryLearning' to initialize with the scikit-learn
          DictionaryLearning class

    n_iter_init : int, optional
        Number of iterations for the dictionary initialization

    max_iter_init : int, optional
        Maximum number of iterations for the classifier initialization

    max_iter_inloop : int, optional
        Maximum number of iterations at each epoch for the classifier update

    batch_size : int, optional
        Size of the batch (1 for stochastic gradient)

    cls : array (n_samples, 1)
        Class labels for the data

    ses_train : array (n_samples, 1)
        Session labels for the data
        (Leave to 0 if sessions are not used)

    sub_dict_size : int
        Size of the sub-dictionnaries related to a unique couple (cls, ses)
        See also [3]_

    k_cls : int
        Number of components that are affected to cls related bases.
        See also [3]_

    k_ses : int
        Number of components that are affected to ses related bases.
        See also [3]_

    nu1 : float
        Class similarity constraint
        See also [3]_

    nu2 : float
        Session similarity constraint
        See also [3]_

    verbose : int
        Set verbose to any positive number for verbosity.

    Attributes
    ----------
    clf : object
        Classifier

    D : array
        Dictionnary

    dist_ses : array
        Distance between bases related to the same session

    dist_cls :
        Distance between bases related to the same class

    cst : array
        Update constraint computed from dist_ses and dist_cls

    References
    ----------

    .. [#] R. Serizel, S. Essid, and G. Richard.
        “Group nonnegative matrix factorisation with speaker and session
        variability compensation for speaker identification”.
        In Proc. of *2016 IEEE International Conference on Acoustics,
        Speech and Signal Processing (ICASSP)*, pp. 5470-5474, 2016.

     """

    def __init__(self, data=np.asarray([[0, 0]]), n_components=64,
                 n_labels=2, pos=True, n_iter=1,
                 lambda1=0, lambda2=0, rho=0.001, verbose=0, mu=1, agreg=1,
                 init='random', n_iter_init=10, batch_size=6250,
                 max_iter_init=10, max_iter_inloop=1,
                 cls=np.asarray([[0, 0]]), ses_train=0,
                 sub_dict_size=1, k_cls=0, k_ses=0, nu1=0, nu2=0):
        """
        Stores the hyper-parameters, creates the classifier and builds the
        initial dictionary ``self.D`` of shape (n_features, n_components).

        The parameters are described in the class docstring.

        ``init`` accepts 'random', 'NMF' (alias 'nmf') and
        'DictionnaryLearning' (alias 'dic-learning').

        Raises
        ------
        ValueError
            If ``init`` is not one of the accepted values. Without this
            check the dictionary would silently stay undefined and the
            failure would only surface later in ``compute_cst``.
        """
        self.data = data
        self.data_shape = data.shape
        self.n_components = n_components
        self.verbose = verbose
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.n_iter = n_iter
        self.n_labels = n_labels
        self.init = init
        self.rho = rho
        self.mu = mu
        self.agreg = agreg
        self.pos = pos
        self.analysis = np.zeros(shape=(n_iter, 5))
        self.batch_size = batch_size
        self.max_iter_init = max_iter_init
        self.max_iter_inloop = max_iter_inloop
        # warm_start lets fit() refine the same classifier at every epoch
        # instead of re-learning it from scratch.
        self.clf = LogisticRegression(
            C=self.mu, multi_class='multinomial',
            solver='lbfgs', max_iter=self.max_iter_init, warm_start=True)
        self.cls = cls
        self.sub_dict_size = sub_dict_size
        self.k_cls = k_cls
        self.k_ses = k_ses
        self.nu1 = nu1
        self.nu2 = nu2
        self.ses_train = ses_train

        # Historical spellings tested by this module and the spellings
        # advertised in the class docstring both map to one canonical key.
        init_aliases = {
            'random': 'random',
            'NMF': 'nmf',
            'nmf': 'nmf',
            'DictionnaryLearning': 'dic-learning',
            'dic-learning': 'dic-learning',
        }
        try:
            init_kind = init_aliases[self.init]
        except (KeyError, TypeError):
            raise ValueError(
                "Unknown init %r: expected one of %s"
                % (self.init, sorted(init_aliases)))

        if init_kind == 'random':
            self.D = np.random.randn(self.data_shape[1], self.n_components)
            # Truthiness (not identity with True) so that 1 or np.bool_
            # values of pos also produce a dictionary.
            if self.pos:
                self.D = np.abs(self.D)
            self.D = preprocessing.normalize(self.D, axis=0)

        elif init_kind == 'nmf':
            if self.verbose > 0:
                print("Initializing dictionnary with beta_ntf")
            ntf = beta_ntf.BetaNTF(data_shape=self.data_shape,
                                   n_components=self.n_components,
                                   n_iter=n_iter_init, verbose=False, beta=2)
            ntf.fit(self.data)
            self.D = ntf.factors_[1]
            self.D = preprocessing.normalize(self.D, axis=0)

        else:
            if self.verbose > 0:
                print(""" Initializing dictionnary
                          with sklearn DictionnaryLearning""")
            u_dl = decomposition.DictionaryLearning(
                n_components=self.n_components,
                alpha=0, max_iter=n_iter_init)
            u_dl.fit(self.data)
            self.D = preprocessing.normalize(
                u_dl.components_.T, axis=0)

        self.compute_cst()

[docs]    def compute_cst(self):
        """ Compute the update constraints based on the distance between
        session related bases and the distance between class related bases"""

        cls = self.cls
        cls_card = np.zeros((int(np.amax(self.cls[:, 0])), ))
        cls_sum = np.zeros((int(np.amax(self.cls[:, 0])),
                            self.D.shape[0], self.k_cls))
        self.dist_cls = 0
        for i in range(len(cls_card)):
            cls_card[i] = max(cls[cls[:, 0] == i].shape[0] - 1, 0)
            ref_sub_ind = np.arange(i*self.sub_dict_size,
                                    i*self.sub_dict_size + self.k_cls)
            for j in np.where(cls[cls[:, 0] == i])[0]:
                sub_ind = np.arange(j*self.sub_dict_size,
                                    j*self.sub_dict_size + self.k_cls)
                cls_sum[i, ] += self.D[:, sub_ind]
                self.dist_cls += np.linalg.norm(
                    self.D[:, sub_ind] - self.D[:, ref_sub_ind])

        ses_card = np.zeros((int(np.amax(self.cls[:, 1])), ))
        ses_sum = np.zeros((int(np.amax(self.cls[:, 1])),
                            self.D.shape[0], self.k_ses))
        self.dist_ses = 0
        for i in range(len(ses_card)):
            ses_card[i] = max(cls[cls[:, 1] == i].shape[0] - 1, 0)
            ref_sub_ind = np.arange(
                i*self.sub_dict_size + self.k_cls,
                i*self.sub_dict_size + self.k_cls + self.k_ses)
            for j in np.where(cls[cls[:, 1] == i])[0]:
                sub_ind = np.arange(
                    j*self.sub_dict_size + self.k_cls,
                    j*self.sub_dict_size + self.k_cls + self.k_ses)
                ses_sum[i, ] += self.D[:, sub_ind]
                self.dist_ses += np.linalg.norm(
                    self.D[:, sub_ind] - self.D[:, ref_sub_ind])

        self.cst = np.zeros((self.D.shape))
        for i in range(self.cls.shape[0]):
            cls = int(self.cls[i, 0]-1)
            ses = int(self.cls[i, 1]-1)
            sub_ind = np.arange(i*self.sub_dict_size,
                                (i+1)*self.sub_dict_size)
            D_sub = self.D[:, sub_ind]
            sub_cst = np.zeros((self.D.shape[0], self.sub_dict_size))

            sub_cst[:, :self.k_cls] = self.nu1 * (
                cls_card[cls] * D_sub[:, :self.k_cls] -
                (cls_sum[cls, ] - D_sub[:, :self.k_cls]))
            sub_cst[:, self.k_cls: self.k_cls+self.k_ses] = self.nu2 * (
                ses_card[ses] *
                D_sub[:, :self.k_cls:self.k_cls+self.k_ses] -
                (ses_sum[ses] - D_sub[:, :self.k_cls:self.k_cls+self.k_ses]))
            self.cst[:, sub_ind] = sub_cst

[docs]    def mean_frames(self, X=0, agreg=15):
        """
        Averages every successive group of agreg rows in the a matrix

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Matrix to reduce by averaging

        agreg : int
            Specifies size of the groups to average

        Returns
        -------
        X_mean: array, shape(n_samples/agreg, n_features)
            Averaged matrix
        """
        return np.mean(np.reshape(X, (X.shape[0]/agreg, agreg, -1)), axis=1)

[docs]    def project_data(self, X, agreg=1):
        """
        Projects data on the model dictionary

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Matrix to project on dictoonary

        agreg : int
            Specifies size of the groups to average after projection

        Returns
        -------
        projections: array, shape(n_samples/agreg, n_components)
            Projection matrix
        """
        alpha_mat = spams.lasso(np.asfortranarray(np.transpose(X)),
                                D=np.asfortranarray(self.D),
                                lambda1=self.lambda1,
                                lambda2=self.lambda2, mode=2,
                                pos=self.pos)
        alpha_mat = alpha_mat.toarray()
        if agreg > 1:
            return np.mean(
                np.reshape(
                    alpha_mat,
                    (alpha_mat.shape[0]/agreg, agreg, -1)),
                axis=1)
        else:
            return alpha_mat

[docs]    def predict(self, X):
        """
        Predicts labels from a given matrix using the model classifier

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Matrix to project on dictoonary

        Returns
        -------
        y_pred: array, shape(n_samples/agreg,)
            Predicted labels
        """
        alpha_mat = self.project_data(X=X, agreg=self.agreg)
        y_pred = self.clf.predict(alpha_mat)

        return y_pred

[docs]    def fit(self, X_train, y_train, X_test=np.array([]), y_test=np.array([])):
        """
        Fits the model to input training data

        Parameters
        ----------
        X_train : array, shape (n_samples, n_features)
            Training data matrix

        X_test : array, shape (n_samples_test, n_features), optional
            Test data matrix
            Usefull only to check performance during development

        y_train : array-like, shape (n_samples/self.agreg,)
            Target vector relative to X_train.

        y_train : array-like, shape (n_samples_test/self.agreg,)
            Target vector relative to X_test.

        Returns
        -------

        """
        for i in range(self.n_iter):

            # Classifier update step #

            tic = time.time()
            # Project full data on D
            alpha_mat = spams.lasso(
                np.asfortranarray(X_train.T),
                D=np.asfortranarray(self.D),
                lambda1=self.lambda1, lambda2=self.lambda2,
                mode=2, pos=self.pos)
            # Average projections if necessary
            alpha_mean = self.mean_frames(
                alpha_mat.toarray().T, agreg=self.agreg)
            alpha_mean = preprocessing.scale(alpha_mean, with_mean=False)

            # Update classifier
            self.clf.fit(alpha_mean, y_train)
            self.w = self.clf.coef_
            self.b = self.clf.intercept_
            self. compute_cst()
            if i == 0:
                # Classifier is initialized on first iteration
                # For further iterations, the LR class is only updated on
                # 1 iteration with warm restart
                self.clf.max_iter = self.max_iter_inloop

            # Print current performance #

            if self.verbose > 0:  # Print the scores
                print("Iteration number %i \n" % i)
                X2, y = check_X_y(
                    alpha_mean, y_train, accept_sparse='csr',
                    dtype=np.float64, order="C")
                lbin = LabelBinarizer()
                Y_binarized = lbin.fit_transform(y_train)
                yo = np.zeros(shape=(self.n_labels, 1))
                yo[:, 0] = self.clf.intercept_
                yo = np.concatenate((self.clf.coef_, yo), axis=1)
                if Y_binarized.shape[1] == 1:
                    Y_binarized = np.hstack([1 - Y_binarized, Y_binarized])
                ut = _multinomial_loss(
                    yo, X2,  Y_binarized, 1/self.mu,
                    sample_weight=np.ones(y.shape[0]))
                yo = np.zeros(shape=(self.n_labels, 1))
                print "Classification costs: ", ut[0]
                print(
                    "Class distance: ", self.dist_cls,
                    "Session distance:", self.dist_ses)
                if i == 0:
                    self.nu1 *= ut[0]/self.dist_cls
                    self.nu2 *= ut[0]/self.dist_ses
                    print "Weights nu1 and nu2", self.nu1, self.nu2
                    self.compute_cst()
                if X_test.any():
                    a, f1 = self.scores(X=X_test, y=y_test)
                    print(""" Classification scores on test set:
                              a=%0.3f   f1=%0.3f""" % (a, f1))
                a, f1 = self.scores(X=X_train, y=y_train)
                print(""" Classification scores on train set:
                          a=%0.3f   f1=%0.3f""" % (a, f1))

            # Dictionary update step #

            draw = range(self.agreg*len(y_train))
            random.shuffle(draw)
            nb_batch = len(draw)/int(self.batch_size) + 1
            for t in range(nb_batch):
                # Select and project a data point

                ind = draw[
                    t*self.batch_size: min((t+1)*self.batch_size, len(draw))]
                x_mat = np.transpose(X_train[ind, ])
                ind = [x/self.agreg for x in ind]
                y = y_train[ind]
                ses = self.ses_train[ind]
                alpha_mat = spams.lasso(np.asfortranarray(x_mat),
                                        D=np.asfortranarray(self.D),
                                        lambda1=self.lambda1,
                                        lambda2=self.lambda2,
                                        mode=2, pos=self.pos)
                alpha_mean = alpha_mat.toarray()

                if alpha_mean.nonzero()[0].any():
                    # Step decaying heuristic
                    rho_t = np.min(
                        [
                            self.rho,
                            self.rho * (
                                nb_batch*self.n_iter*self.batch_size
                                ) / (
                                10*(
                                    (i*nb_batch*self.batch_size) +
                                    t*self.batch_size + 1))])

                    # Gradient of loss with respect to projections
                    denom = np.zeros((alpha_mat.shape[1], ))
                    num_alpha = np.zeros(alpha_mat.shape)
                    for k in range(self.n_labels):
                        tmp = np.exp(
                            np.dot(self.w[k, :], alpha_mean) + self.b[k])
                        denom += tmp
                        num_alpha += (np.dot(
                            self.w[k, :][:, np.newaxis],
                            tmp[:, np.newaxis].T))
                    d_alpha = (
                        num_alpha.T / denom[:, np.newaxis]) - self.w[y, :]

                    # Update D
                    self.update_D(
                        x_mat=x_mat, y=y, denom=denom,
                        num_alpha=num_alpha, d_alpha=d_alpha,
                        alpha_mat=alpha_mat, rho_t=rho_t, ses=ses)

            if self.verbose > 0:
                print "Iteration time", time.time() - tic
        # Print final performance #

        if self.verbose > 0:  # Print the scores
            print("Final model")
            if X_test.any():
                a, f1 = self.scores(X=X_test, y=y_test)
                print(""" Classification scores on test set:
                          a=%0.3f   f1=%0.3f""" % (a, f1))
            a, f1, = self.scores(X=X_train, y=y_train)
            print(""" Classification scores on train set:
                      a=%0.3f   f1=%0.3f""" % (a, f1))

[docs]    def scores(self, X, y):
        """
        Compute classification scores (accurracy and F1-score).
        on a given dataset.

        Parameters
        ----------
        X_train : array, shape (n_samples, n_features)
            Training data matrix

        X_test : array, shape (n_samples_test, n_features), optional
            Test data matrix
            Usefull only to check performance during development

        y_train : array-like, shape (n_samples/self.agreg,)
            Target vector relative to X_train.

        y_train : array-like, shape (n_samples_test/self.agreg,)
            Target vector relative to X_test.

        Returns
        -------
        y_pred: array, shape(n_samples/agreg,)
        """
        tic = time.time()
        alpha_mat = spams.lasso(
            np.asfortranarray(X.T),
            D=np.asfortranarray(self.D),
            lambda1=self.lambda1,
            lambda2=self.lambda2, mode=2,
            pos=self.pos)

        alpha_mean = self.mean_frames(
            alpha_mat.toarray().T, agreg=self.agreg)
        alpha_mean = preprocessing.scale(
            alpha_mean, with_mean=False)
        y_pred = self.clf.predict(alpha_mean)
        a = accuracy_score(y, y_pred)
        f1 = f1_score(y, y_pred, average='weighted')

        return a, f1

[docs]    def update_D(self, x_mat=0, y=0, alpha_mat=0, denom=0, num_alpha=0,
                 d_alpha=0, rho_t=0.001, ses=0):
        """
        Updates dictionary

        Parameters
        ----------
        x_mat : array shape (n_features, batch_size)
            Input data (batch_size = 1 in the stochastic gradient case)

        y : array shape (batch_size, )
            Labels corresponding to the input data

        alpha_mat : array shape (batch_size, n_components)
            Projections

        d_alpha : array shape (batch_size, n_components)
            Gradient of loss with respect to projections

        denom : array shape (n_components, )
            Gradient denominator

        num_alpha : array (batch_size, n_components)
            Gradient numerator

        rho_t : float
            Learning rate

        ses : array (batch_size, )
            Session labels for the data
            (Leave to 0 if sessions are not used)

        """
        non_zero = alpha_mat.nonzero()
        beta = np.zeros(num_alpha.shape)
        beta_sub = np.zeros(num_alpha.shape)
        n_nan = 0
        tot_loop = 0
        lr_weight = np.zeros((beta.shape[0], ))
        alpha_mat = alpha_mat.toarray()
        alpha_mat_sub = np.zeros(alpha_mat.shape)
        for i in range(num_alpha.shape[1]):
            cls_ind = np.where(
                ((self.cls[:, 0] == y[i]) & (self.cls[:, 1] == ses[i])))[0][0]
            sub_ind = np.arange(cls_ind*self.sub_dict_size,
                                (cls_ind+1)*self.sub_dict_size)
            ind = non_zero[0][non_zero[1] == i]
            lr_weight[sub_ind] += 1
            alpha_mat_sub[sub_ind, i] = alpha_mat[sub_ind, i]
            if sub_ind.shape[0] > 1:
                tot_loop += 1
                beta[ind, i] = np.dot(
                    spams.invSym(
                        np.asfortranarray(
                            np.dot(
                                np.transpose(self.D[:, ind]),
                                self.D[:, ind]) +
                            self.lambda2)),
                    d_alpha[i, ind])
                if np.isnan(np.amax(np.abs(beta[sub_ind, i]))):
                    n_nan += 1
                    beta[ind, i] = np.zeros(beta[sub_ind, i].shape)
                beta_sub[sub_ind, i] = beta[sub_ind, i]
            elif sub_ind.shape[0] == 1:
                tot_loop += 1
                beta[sub_ind, i] = np.dot(
                    1./(np.dot(np.transpose(self.D[:, sub_ind]),
                               self.D[:, sub_ind]) + self.lambda2),
                    d_alpha[i, ind])
                alpha_mat_sub[sub_ind, i] = alpha_mat[sub_ind, i]
        if tot_loop/(n_nan+1) < 2:
            print "Warning nan occurence", n_nan, "total loops", tot_loop
        d_D = np.dot(
            (x_mat-np.dot(self.D, alpha_mat_sub)), np.transpose(beta_sub))
        d_D -= np.dot(np.dot(self.D, beta_sub), np.transpose(alpha_mat_sub))

        sub_ind = np.arange(cls_ind*self.sub_dict_size,
                            (cls_ind+1)*self.sub_dict_size)
        rho_t *= lr_weight/self.batch_size
        self.D = self.D - rho_t * d_D - self.cst

        if self.pos is True:
            self.D[np.where(self.D < 0)] = 0.0000001
        self.D = preprocessing.normalize(self.D, axis=0)


def _multinomial_loss(w, X, Y, alpha, sample_weight):
    """Multinomial loss with an l2 penalty, and class probabilities.

    Parameters
    ----------
    w : ndarray, shape (n_classes * n_features,) or
        (n_classes * (n_features + 1),)
        Coefficients; when the size matches the second form, the last
        value of each class row is treated as its intercept.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    Y : ndarray, shape (n_samples, n_classes)
        Binarized labels, one column per class.
    alpha : float
        Weight of the penalty ``0.5 * alpha * squared_norm(w)``.
    sample_weight : array-like, shape (n_samples,)
        Weight of every sample in the data term.

    Returns
    -------
    loss : float
        Weighted negative log-likelihood plus the penalty.
    p : ndarray, shape (n_samples, n_classes)
        Class probabilities.
    w : ndarray, shape (n_classes, n_features)
        Coefficients reshaped per class, without the intercept column.
    """
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    has_intercept = (n_classes * (n_features + 1)) == w.size
    w = w.reshape(n_classes, -1)
    weights_col = sample_weight[:, np.newaxis]

    intercept = 0
    if has_intercept:
        intercept, w = w[:, -1], w[:, :-1]

    # Log-probabilities: scores normalized by their log-sum-exp per sample
    log_proba = safe_sparse_dot(X, w.T)
    log_proba += intercept
    log_proba -= logsumexp(log_proba, axis=1)[:, np.newaxis]

    data_term = -(weights_col * Y * log_proba).sum()
    penalty = 0.5 * alpha * squared_norm(w)
    loss = data_term + penalty

    # Exponentiate in place to turn log-probabilities into probabilities
    np.exp(log_proba, log_proba)
    return loss, log_proba, w


def _intercept_dot(w, X, y):
    """Computes y * (np.dot(X, w) + c), c being the optional intercept.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector; with n_features + 1 values, the last one is
        the intercept.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.

    Returns
    -------
    w : ndarray, shape (n_features,)
        Coefficients without the intercept.
    c : float
        Intercept (0. when w carries none).
    yz : ndarray, shape (n_samples,)
        y multiplied by the decision values.
    """
    has_intercept = w.size == X.shape[1] + 1
    c = w[-1] if has_intercept else 0.
    if has_intercept:
        w = w[:-1]

    decision = safe_sparse_dot(X, w) + c
    return w, c, y * decision