# -*- coding: utf-8 -*-
"""
supervised_dl_class.py
~~~~~~~~~~~~~~~~~~~~~~
.. topic:: Contents
The supervised_dl_class module include the SupervisedDL class,
fit and score function
Created on Wed Jun 29 16:37:28 2016
@authors: bisot, serizel
.. [#] V.Bisot, R. Serizel, S. Essid, and G. Richard.
"Feature Learning with Matrix Factorization Applied to
Acoustic Scene Classification".
Accepted for publication in *IEEE Transactions on Audio,
Speech and Language Processing*, 2017
.. [#] R. Serizel, V.Bisot, S. Essid, and G. Richard.
“Supervised group nonnegative matrix factorisation with similarity
constraints and applications to speaker identification”.
In Proc. of *2017 IEEE International Conference on Acoustics,
Speech and Signal Processing (ICASSP)*, 2017.
"""
import numpy as np
import spams
from sklearn import decomposition
import beta_ntf
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
import random
import time
import copy
from sklearn.utils.extmath import (
logsumexp, log_logistic, safe_sparse_dot,
softmax, squared_norm)
from sklearn.utils.validation import (
DataConversionWarning,
check_X_y, NotFittedError)
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
[docs]class SupervisedDL(object):
""" Supervised DL class
Task-driven Dictionary Learning with modified algorithm
Parameters
----------
data : array, shape (n_samples, n_features)
Training data matrix
Needs to be provided if initialization is done in the model
n_components : int
Size of the dictionary
n_labels : int
number of classes
pos : bool, default: True
When set to True, the model is fit in its nonnegative formulation
n_iter : int
Number of epochs on which to run the algorithm
lambda1 : float, default: 0.1
Paramter controlling the l1 norm penality for the projection step
lambda2 : float, default: 0
Paramter controlling the l2 norm penality for the projection step
rho : float, default: 0.001
Initial gradient step parameter
mu : float, default: 1
Regularization strenght of the classifier; must be positive.
Smaller values specify stronger regularization.
agreg : int, optional (default: 1)
In the case classification is done on bags of agreg successive
projections. Every successive group of agreg projections
(without overlapp) will be averaged before classification
init : str, {'random', 'nmf', 'dic-learning'}
Controls the nature of the dictionary initialization.
* Use 'random' for a random initalization of
* Use 'nmf' for intializaing with nonnegative matrix factorization
* Use 'dic-learning' to intilaize with the scikit-learn DictionaryLearning class
n_iter_init : int, optional
Number of iterations for the dictionary initialization
max_iter_init : int, optional
Maximum number of iterations for the classifier initialization
max_iter_inloop : int, optional
Maximum number of iterations at each epoch for the classifier update
batch_size : int, optional
Size of the batch (1 for stochastic gradient)
cls : array (n_samples, 1)
Class labels for the data
ses_train : array (n_samples, 1)
Session labels for the data
(Leave to 0 if sessions are not used)
sub_dict_size : int
Size of the sub-dictionnaries related to a unique couple (cls, ses)
See also [3]_
k_cls : int
Number of components that are affected to cls related bases.
See also [3]_
k_ses : int
Number of components that are affected to ses related bases.
See also [3]_
nu1 : float
Class similarity constraint
See also [3]_
nu2 : float
Session similarity constraint
See also [3]_
verbose : int
Set verbose to any positive number for verbosity.
Attributes
----------
clf : object
Classifier
D : array
Dictionnary
dist_ses : array
Distance between bases related to the same session
dist_cls :
Distance between bases related to the same class
cst : array
Update constraint computed from dist_ses and dist_cls
References
----------
.. [#] R. Serizel, S. Essid, and G. Richard.
“Group nonnegative matrix factorisation with speaker and session
variability compensation for speaker identification”.
In Proc. of *2016 IEEE International Conference on Acoustics,
Speech and Signal Processing (ICASSP)*, pp. 5470-5474, 2016.
"""
def __init__(self, data=np.asarray([[0, 0]]), n_components=64,
n_labels=2, pos=True, n_iter=1,
lambda1=0, lambda2=0, rho=0.001, verbose=0, mu=1, agreg=1,
init='random', n_iter_init=10, batch_size=6250,
max_iter_init=10, max_iter_inloop=1,
cls=np.asarray([[0, 0]]), ses_train=0,
sub_dict_size=1, k_cls=0, k_ses=0, nu1=0, nu2=0):
self.data = data
self.data_shape = data.shape
self.n_components = n_components
self.verbose = verbose
self.lambda1 = lambda1
self.lambda2 = lambda2
self.n_iter = n_iter
self.n_labels = n_labels
self.init = init
self.rho = rho
self.mu = mu
self.agreg = agreg
self.pos = pos
self.analysis = np.zeros(shape=(n_iter, 5))
self.batch_size = batch_size
self.max_iter_init = max_iter_init
self.max_iter_inloop = max_iter_inloop
self.clf = LogisticRegression(
C=self.mu, multi_class='multinomial',
solver='lbfgs', max_iter=self.max_iter_init, warm_start=True)
self.cls = cls
self.sub_dict_size = sub_dict_size
self.k_cls = k_cls
self.k_ses = k_ses
self.nu1 = nu1
self.nu2 = nu2
self.ses_train = ses_train
if self.init == 'random':
if self.pos is False:
self.D = np.random.randn(self.data_shape[1],
self.n_components)
if self.pos is True:
self.D = np.abs(np.random.randn(self.data_shape[1],
self.n_components))
self.D = preprocessing.normalize(self.D, axis=0)
if self.init == 'NMF':
if self.verbose > 0:
print("Initializing dictionnary with beta_ntf")
ntf = beta_ntf.BetaNTF(data_shape=self.data_shape,
n_components=self.n_components,
n_iter=n_iter_init, verbose=False, beta=2)
ntf.fit(self.data)
self.D = ntf.factors_[1]
self.D = preprocessing.normalize(self.D, axis=0)
if self.init == 'DictionnaryLearning':
if self.verbose > 0:
print(""" Initializing dictionnary
with sklearn DictionnaryLearning""")
u_dl = decomposition.DictionaryLearning(
n_components=self.n_components,
alpha=0, max_iter=n_iter_init)
u_dl.fit(self.data)
self.D = preprocessing.normalize(
u_dl.components_.T, axis=0)
self.compute_cst()
[docs] def compute_cst(self):
""" Compute the update constraints based on the distance between
session related bases and the distance between class related bases"""
cls = self.cls
cls_card = np.zeros((int(np.amax(self.cls[:, 0])), ))
cls_sum = np.zeros((int(np.amax(self.cls[:, 0])),
self.D.shape[0], self.k_cls))
self.dist_cls = 0
for i in range(len(cls_card)):
cls_card[i] = max(cls[cls[:, 0] == i].shape[0] - 1, 0)
ref_sub_ind = np.arange(i*self.sub_dict_size,
i*self.sub_dict_size + self.k_cls)
for j in np.where(cls[cls[:, 0] == i])[0]:
sub_ind = np.arange(j*self.sub_dict_size,
j*self.sub_dict_size + self.k_cls)
cls_sum[i, ] += self.D[:, sub_ind]
self.dist_cls += np.linalg.norm(
self.D[:, sub_ind] - self.D[:, ref_sub_ind])
ses_card = np.zeros((int(np.amax(self.cls[:, 1])), ))
ses_sum = np.zeros((int(np.amax(self.cls[:, 1])),
self.D.shape[0], self.k_ses))
self.dist_ses = 0
for i in range(len(ses_card)):
ses_card[i] = max(cls[cls[:, 1] == i].shape[0] - 1, 0)
ref_sub_ind = np.arange(
i*self.sub_dict_size + self.k_cls,
i*self.sub_dict_size + self.k_cls + self.k_ses)
for j in np.where(cls[cls[:, 1] == i])[0]:
sub_ind = np.arange(
j*self.sub_dict_size + self.k_cls,
j*self.sub_dict_size + self.k_cls + self.k_ses)
ses_sum[i, ] += self.D[:, sub_ind]
self.dist_ses += np.linalg.norm(
self.D[:, sub_ind] - self.D[:, ref_sub_ind])
self.cst = np.zeros((self.D.shape))
for i in range(self.cls.shape[0]):
cls = int(self.cls[i, 0]-1)
ses = int(self.cls[i, 1]-1)
sub_ind = np.arange(i*self.sub_dict_size,
(i+1)*self.sub_dict_size)
D_sub = self.D[:, sub_ind]
sub_cst = np.zeros((self.D.shape[0], self.sub_dict_size))
sub_cst[:, :self.k_cls] = self.nu1 * (
cls_card[cls] * D_sub[:, :self.k_cls] -
(cls_sum[cls, ] - D_sub[:, :self.k_cls]))
sub_cst[:, self.k_cls: self.k_cls+self.k_ses] = self.nu2 * (
ses_card[ses] *
D_sub[:, :self.k_cls:self.k_cls+self.k_ses] -
(ses_sum[ses] - D_sub[:, :self.k_cls:self.k_cls+self.k_ses]))
self.cst[:, sub_ind] = sub_cst
[docs] def mean_frames(self, X=0, agreg=15):
"""
Averages every successive group of agreg rows in the a matrix
Parameters
----------
X : array, shape (n_samples, n_features)
Matrix to reduce by averaging
agreg : int
Specifies size of the groups to average
Returns
-------
X_mean: array, shape(n_samples/agreg, n_features)
Averaged matrix
"""
return np.mean(np.reshape(X, (X.shape[0]/agreg, agreg, -1)), axis=1)
[docs] def project_data(self, X, agreg=1):
"""
Projects data on the model dictionary
Parameters
----------
X : array, shape (n_samples, n_features)
Matrix to project on dictoonary
agreg : int
Specifies size of the groups to average after projection
Returns
-------
projections: array, shape(n_samples/agreg, n_components)
Projection matrix
"""
alpha_mat = spams.lasso(np.asfortranarray(np.transpose(X)),
D=np.asfortranarray(self.D),
lambda1=self.lambda1,
lambda2=self.lambda2, mode=2,
pos=self.pos)
alpha_mat = alpha_mat.toarray()
if agreg > 1:
return np.mean(
np.reshape(
alpha_mat,
(alpha_mat.shape[0]/agreg, agreg, -1)),
axis=1)
else:
return alpha_mat
[docs] def predict(self, X):
"""
Predicts labels from a given matrix using the model classifier
Parameters
----------
X : array, shape (n_samples, n_features)
Matrix to project on dictoonary
Returns
-------
y_pred: array, shape(n_samples/agreg,)
Predicted labels
"""
alpha_mat = self.project_data(X=X, agreg=self.agreg)
y_pred = self.clf.predict(alpha_mat)
return y_pred
[docs] def fit(self, X_train, y_train, X_test=np.array([]), y_test=np.array([])):
"""
Fits the model to input training data
Parameters
----------
X_train : array, shape (n_samples, n_features)
Training data matrix
X_test : array, shape (n_samples_test, n_features), optional
Test data matrix
Usefull only to check performance during development
y_train : array-like, shape (n_samples/self.agreg,)
Target vector relative to X_train.
y_train : array-like, shape (n_samples_test/self.agreg,)
Target vector relative to X_test.
Returns
-------
"""
for i in range(self.n_iter):
# Classifier update step #
tic = time.time()
# Project full data on D
alpha_mat = spams.lasso(
np.asfortranarray(X_train.T),
D=np.asfortranarray(self.D),
lambda1=self.lambda1, lambda2=self.lambda2,
mode=2, pos=self.pos)
# Average projections if necessary
alpha_mean = self.mean_frames(
alpha_mat.toarray().T, agreg=self.agreg)
alpha_mean = preprocessing.scale(alpha_mean, with_mean=False)
# Update classifier
self.clf.fit(alpha_mean, y_train)
self.w = self.clf.coef_
self.b = self.clf.intercept_
self. compute_cst()
if i == 0:
# Classifier is initialized on first iteration
# For further iterations, the LR class is only updated on
# 1 iteration with warm restart
self.clf.max_iter = self.max_iter_inloop
# Print current performance #
if self.verbose > 0: # Print the scores
print("Iteration number %i \n" % i)
X2, y = check_X_y(
alpha_mean, y_train, accept_sparse='csr',
dtype=np.float64, order="C")
lbin = LabelBinarizer()
Y_binarized = lbin.fit_transform(y_train)
yo = np.zeros(shape=(self.n_labels, 1))
yo[:, 0] = self.clf.intercept_
yo = np.concatenate((self.clf.coef_, yo), axis=1)
if Y_binarized.shape[1] == 1:
Y_binarized = np.hstack([1 - Y_binarized, Y_binarized])
ut = _multinomial_loss(
yo, X2, Y_binarized, 1/self.mu,
sample_weight=np.ones(y.shape[0]))
yo = np.zeros(shape=(self.n_labels, 1))
print "Classification costs: ", ut[0]
print(
"Class distance: ", self.dist_cls,
"Session distance:", self.dist_ses)
if i == 0:
self.nu1 *= ut[0]/self.dist_cls
self.nu2 *= ut[0]/self.dist_ses
print "Weights nu1 and nu2", self.nu1, self.nu2
self.compute_cst()
if X_test.any():
a, f1 = self.scores(X=X_test, y=y_test)
print(""" Classification scores on test set:
a=%0.3f f1=%0.3f""" % (a, f1))
a, f1 = self.scores(X=X_train, y=y_train)
print(""" Classification scores on train set:
a=%0.3f f1=%0.3f""" % (a, f1))
# Dictionary update step #
draw = range(self.agreg*len(y_train))
random.shuffle(draw)
nb_batch = len(draw)/int(self.batch_size) + 1
for t in range(nb_batch):
# Select and project a data point
ind = draw[
t*self.batch_size: min((t+1)*self.batch_size, len(draw))]
x_mat = np.transpose(X_train[ind, ])
ind = [x/self.agreg for x in ind]
y = y_train[ind]
ses = self.ses_train[ind]
alpha_mat = spams.lasso(np.asfortranarray(x_mat),
D=np.asfortranarray(self.D),
lambda1=self.lambda1,
lambda2=self.lambda2,
mode=2, pos=self.pos)
alpha_mean = alpha_mat.toarray()
if alpha_mean.nonzero()[0].any():
# Step decaying heuristic
rho_t = np.min(
[
self.rho,
self.rho * (
nb_batch*self.n_iter*self.batch_size
) / (
10*(
(i*nb_batch*self.batch_size) +
t*self.batch_size + 1))])
# Gradient of loss with respect to projections
denom = np.zeros((alpha_mat.shape[1], ))
num_alpha = np.zeros(alpha_mat.shape)
for k in range(self.n_labels):
tmp = np.exp(
np.dot(self.w[k, :], alpha_mean) + self.b[k])
denom += tmp
num_alpha += (np.dot(
self.w[k, :][:, np.newaxis],
tmp[:, np.newaxis].T))
d_alpha = (
num_alpha.T / denom[:, np.newaxis]) - self.w[y, :]
# Update D
self.update_D(
x_mat=x_mat, y=y, denom=denom,
num_alpha=num_alpha, d_alpha=d_alpha,
alpha_mat=alpha_mat, rho_t=rho_t, ses=ses)
if self.verbose > 0:
print "Iteration time", time.time() - tic
# Print final performance #
if self.verbose > 0: # Print the scores
print("Final model")
if X_test.any():
a, f1 = self.scores(X=X_test, y=y_test)
print(""" Classification scores on test set:
a=%0.3f f1=%0.3f""" % (a, f1))
a, f1, = self.scores(X=X_train, y=y_train)
print(""" Classification scores on train set:
a=%0.3f f1=%0.3f""" % (a, f1))
[docs] def scores(self, X, y):
"""
Compute classification scores (accurracy and F1-score).
on a given dataset.
Parameters
----------
X_train : array, shape (n_samples, n_features)
Training data matrix
X_test : array, shape (n_samples_test, n_features), optional
Test data matrix
Usefull only to check performance during development
y_train : array-like, shape (n_samples/self.agreg,)
Target vector relative to X_train.
y_train : array-like, shape (n_samples_test/self.agreg,)
Target vector relative to X_test.
Returns
-------
y_pred: array, shape(n_samples/agreg,)
"""
tic = time.time()
alpha_mat = spams.lasso(
np.asfortranarray(X.T),
D=np.asfortranarray(self.D),
lambda1=self.lambda1,
lambda2=self.lambda2, mode=2,
pos=self.pos)
alpha_mean = self.mean_frames(
alpha_mat.toarray().T, agreg=self.agreg)
alpha_mean = preprocessing.scale(
alpha_mean, with_mean=False)
y_pred = self.clf.predict(alpha_mean)
a = accuracy_score(y, y_pred)
f1 = f1_score(y, y_pred, average='weighted')
return a, f1
[docs] def update_D(self, x_mat=0, y=0, alpha_mat=0, denom=0, num_alpha=0,
d_alpha=0, rho_t=0.001, ses=0):
"""
Updates dictionary
Parameters
----------
x_mat : array shape (n_features, batch_size)
Input data (batch_size = 1 in the stochastic gradient case)
y : array shape (batch_size, )
Labels corresponding to the input data
alpha_mat : array shape (batch_size, n_components)
Projections
d_alpha : array shape (batch_size, n_components)
Gradient of loss with respect to projections
denom : array shape (n_components, )
Gradient denominator
num_alpha : array (batch_size, n_components)
Gradient numerator
rho_t : float
Learning rate
ses : array (batch_size, )
Session labels for the data
(Leave to 0 if sessions are not used)
"""
non_zero = alpha_mat.nonzero()
beta = np.zeros(num_alpha.shape)
beta_sub = np.zeros(num_alpha.shape)
n_nan = 0
tot_loop = 0
lr_weight = np.zeros((beta.shape[0], ))
alpha_mat = alpha_mat.toarray()
alpha_mat_sub = np.zeros(alpha_mat.shape)
for i in range(num_alpha.shape[1]):
cls_ind = np.where(
((self.cls[:, 0] == y[i]) & (self.cls[:, 1] == ses[i])))[0][0]
sub_ind = np.arange(cls_ind*self.sub_dict_size,
(cls_ind+1)*self.sub_dict_size)
ind = non_zero[0][non_zero[1] == i]
lr_weight[sub_ind] += 1
alpha_mat_sub[sub_ind, i] = alpha_mat[sub_ind, i]
if sub_ind.shape[0] > 1:
tot_loop += 1
beta[ind, i] = np.dot(
spams.invSym(
np.asfortranarray(
np.dot(
np.transpose(self.D[:, ind]),
self.D[:, ind]) +
self.lambda2)),
d_alpha[i, ind])
if np.isnan(np.amax(np.abs(beta[sub_ind, i]))):
n_nan += 1
beta[ind, i] = np.zeros(beta[sub_ind, i].shape)
beta_sub[sub_ind, i] = beta[sub_ind, i]
elif sub_ind.shape[0] == 1:
tot_loop += 1
beta[sub_ind, i] = np.dot(
1./(np.dot(np.transpose(self.D[:, sub_ind]),
self.D[:, sub_ind]) + self.lambda2),
d_alpha[i, ind])
alpha_mat_sub[sub_ind, i] = alpha_mat[sub_ind, i]
if tot_loop/(n_nan+1) < 2:
print "Warning nan occurence", n_nan, "total loops", tot_loop
d_D = np.dot(
(x_mat-np.dot(self.D, alpha_mat_sub)), np.transpose(beta_sub))
d_D -= np.dot(np.dot(self.D, beta_sub), np.transpose(alpha_mat_sub))
sub_ind = np.arange(cls_ind*self.sub_dict_size,
(cls_ind+1)*self.sub_dict_size)
rho_t *= lr_weight/self.batch_size
self.D = self.D - rho_t * d_D - self.cst
if self.pos is True:
self.D[np.where(self.D < 0)] = 0.0000001
self.D = preprocessing.normalize(self.D, axis=0)
def _multinomial_loss(w, X, Y, alpha, sample_weight):
"""Computes multinomial loss and class probabilities.
Parameters
----------
w : ndarray, shape (n_classes * n_features,) or
(n_classes * (n_features + 1),)
Coefficient vector.
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data.
Y : ndarray, shape (n_samples, n_classes)
Transformed labels according to the output of LabelBinarizer.
alpha : float
Regularization parameter. alpha is equal to 1 / C.
sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight.
Returns
-------
loss : float
Multinomial loss.
p : ndarray, shape (n_samples, n_classes)
Estimated class probabilities.
w : ndarray, shape (n_classes, n_features)
Reshaped param vector excluding intercept terms.
"""
n_classes = Y.shape[1]
n_features = X.shape[1]
fit_intercept = w.size == (n_classes * (n_features + 1))
w = w.reshape(n_classes, -1)
sample_weight = sample_weight[:, np.newaxis]
if fit_intercept:
intercept = w[:, -1]
w = w[:, :-1]
else:
intercept = 0
p = safe_sparse_dot(X, w.T)
p += intercept
p -= logsumexp(p, axis=1)[:, np.newaxis]
loss = -(sample_weight * Y * p).sum()
loss += 0.5 * alpha * squared_norm(w)
p = np.exp(p, p)
return loss, p, w
def _intercept_dot(w, X, y):
"""Computes y * np.dot(X, w).
It takes into consideration if the intercept should be fit or not.
Parameters
----------
w : ndarray, shape (n_features,) or (n_features + 1,)
Coefficient vector.
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data.
y : ndarray, shape (n_samples,)
Array of labels.
"""
c = 0.
if w.size == X.shape[1] + 1:
c = w[-1]
w = w[:-1]
z = safe_sparse_dot(X, w) + c
return w, c, y * z