# Deciphering non-verbal behaviours
### Jupyter notebooks:
[Version 0](
[Version 1](
[Version 2](
import numpy as np
import librosa
import torch
from import Dataset
import fairseq
class IEMOCAPDataset(Dataset):
def __init__(self, data_root: str,
def __init__(self,
data_root: str,
train: bool = True,
sequence_length: int = 100,
features_name: str = "spec",
session_to_test: int = 5,
from_npy: str = None
from_npy: str = None,
root_path:str = None,
wa2v_weights_path:str = None
if train:
self.iemocap_table = data_root.query(f'session!={session_to_test}')
......@@ -22,17 +28,27 @@ class IEMOCAPDataset(Dataset):
self.sequence_length = sequence_length
self.features_name = features_name
self.from_npy = from_npy
if self.from_npy is not None:
self.all_data = np.load(self.from_npy, allow_pickle=True)
self.root_path = root_path
self.emo_to_int = dict(hap= 0, ang= 1, neu= 2, sad= 3, exc= 0)
if features_name == "wav2vec" and from_npy is None:
cp_path = wa2v_weights_path
self.model_wav2vec, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path])
self.model_wav2vec = self.model_wav2vec[0]
def __len__(self):
return len(self.table)
def load_wav(path: str):
""" Load audio """
path = "/content/gdrive/MyDrive/IEMOCAP_full_release_withoutVideos_sentenceOnly/" + path
signal, sr = librosa.load(path)
return signal, sr
......@@ -56,14 +72,18 @@ class IEMOCAPDataset(Dataset):
def spec(self, signal, sample_rate):
X = librosa.stft(signal,
win_length = 1024)
X= np.abs(X)**2
return X
def wav2vec(self, signal, sample_rate):
wav2vec = self.model_wav2vec.feature_extractor(signal)
return wav2vec
def padding(data, seq_length=50):
......@@ -83,7 +103,7 @@ class IEMOCAPDataset(Dataset):
elif self.features_name.lower() == "melspec": # reprsentation condensee du spectogram
features = self.melspec(signal, sr)
elif self.features_name.lower() == "wav2vec":
features = self.wav2vec(signal, parameters)
features = self.wav2vec(torch.from_numpy(signal)[None],sr)[0].cpu().detach().numpy()
elif self.features_name.lower() == "spec":
features = self.spec(signal, sr)
......@@ -99,7 +119,8 @@ class IEMOCAPDataset(Dataset):
emotion = self.emo_to_int[emotion]
if self.from_npy is None:
audio, sr = self.load_wav(line)
wav_path = self.root_path + "/" + line
audio, sr = self.load_wav(wav_path)
features = self.extract_features(audio, sr).transpose()
features = self.all_data[item]
import numpy as np
import librosa
import torch
from import Dataset
import fairseq
class IEMOCAPDataset(Dataset):
    """Fixed-length audio-feature windows from IEMOCAP, with emotion labels.

    One session is held out: the training split contains every row whose
    ``session`` differs from ``session_to_test``; the test split contains
    only the rows of that session. Features are either computed on the fly
    from the wav files or read from a pre-computed ``.npy`` file.
    """

    def __init__(self,
                 data_root,
                 train: bool = True,
                 sequence_length: int = 100,
                 features_name: str = "spec",
                 session_to_test: int = 5,
                 from_npy: str = None,
                 root_path: str = None,
                 wa2v_weights_path: str = None):
        """Select the split and prepare the feature source.

        Args:
            data_root: Table exposing ``.query`` and the ``session``,
                ``wav_path`` and ``emotion`` columns (used like a pandas
                DataFrame).
            train: If True keep every session except ``session_to_test``,
                otherwise keep only ``session_to_test``.
            sequence_length: Number of frames in each returned window.
            features_name: One of "mfcc", "melspec", "wav2vec", "spec"
                (case-insensitive).
            session_to_test: Session held out for testing.
            from_npy: Optional path to pre-computed features; when given,
                no audio is loaded in ``__getitem__``.
            root_path: Directory prepended to every ``wav_path`` entry.
            wa2v_weights_path: fairseq checkpoint, only loaded when wav2vec
                features are computed on the fly.
        """
        if train:
            self.iemocap_table = data_root.query(f'session!={session_to_test}')
        else:
            self.iemocap_table = data_root.query(f'session=={session_to_test}')
        self.table = self.iemocap_table
        self.train = train
        self.sequence_length = sequence_length
        self.features_name = features_name
        self.from_npy = from_npy
        if self.from_npy is not None:
            # NOTE: allow_pickle=True executes pickled payloads; only load
            # feature files that this project produced itself.
            self.all_data = np.load(self.from_npy, allow_pickle=True)
        self.root_path = root_path
        # "exc" (excited) shares class 0 with "hap" (happy).
        self.emo_to_int = dict(hap=0, ang=1, neu=2, sad=3, exc=0)
        # Compare case-insensitively, as extract_features() does, so that
        # e.g. "Wav2Vec" also gets its model loaded.
        wants_wav2vec = (isinstance(features_name, str)
                         and features_name.lower() == "wav2vec")
        if wants_wav2vec and from_npy is None:
            ensemble, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
                [wa2v_weights_path])
            self.model_wav2vec = ensemble[0]

    def __len__(self):
        """Return the number of utterances in the selected split."""
        return len(self.table)

    @staticmethod
    def load_wav(path: str):
        """Load the audio file at ``path``; return ``(signal, sample_rate)``."""
        # NOTE(review): librosa.load is called without ``sr``, so librosa's
        # default resampling applies, while __getitem__ later passes 16000
        # to extract_features -- confirm the intended rate.
        signal, sr = librosa.load(path)
        return signal, sr

    def melspec(self, signal, sample_rate):
        """Return the 80-band mel spectrogram of ``signal``.

        ``sample_rate`` is accepted for interface symmetry and is not
        forwarded to librosa.
        """
        mel = librosa.feature.melspectrogram(y=signal,
                                             n_mels=80,
                                             win_length=1024)
        return mel

    def mfcc(self, signal, sample_rate):
        """Return the MFCCs of ``signal`` (``sample_rate`` is not forwarded)."""
        # The audio must be passed by keyword: recent librosa releases
        # reject it as a positional argument.
        mfcc = librosa.feature.mfcc(y=signal,
                                    win_length=1024)
        return mfcc

    def spec(self, signal, sample_rate):
        """Return the power spectrogram (squared STFT magnitude)."""
        X = librosa.stft(signal,
                         win_length=1024)
        return np.abs(X) ** 2

    def wav2vec(self, signal, sample_rate):
        """Run the wav2vec feature extractor on a batched tensor ``signal``."""
        return self.model_wav2vec.feature_extractor(signal)

    @staticmethod
    def padding(data, seq_length=50):
        """Extend a 2-D array along axis 0 to ``seq_length`` rows.

        :param data: array to pad; non-2-D input is returned unchanged.
        :param seq_length: minimum number of rows of the result.
        :return: ``data`` with rows repeated cyclically ('wrap' mode), or
            ``data`` itself when it already has enough rows.
        """
        if len(data.shape) == 2:
            missing = seq_length - data.shape[0]
            # np.pad rejects negative widths, so only pad short inputs.
            if missing > 0:
                data = np.pad(data, ((0, missing), (0, 0)), 'wrap')
        return data

    def extract_features(self, signal, sr):
        """Compute the features selected by ``self.features_name``.

        Raises:
            ValueError: if ``features_name`` is not a supported extractor.
        """
        name = self.features_name.lower()
        if name == "mfcc":
            # 15/16 coefficients instead of 80; expected to perform worst
            # because MFCCs discard pitch, and emotion is tied to pitch.
            features = self.mfcc(signal, sr)
        elif name == "melspec":
            # Condensed representation of the spectrogram.
            features = self.melspec(signal, sr)
        elif name == "wav2vec":
            # Add a batch axis for the model, then drop it again.
            batch = torch.from_numpy(signal)[None]
            features = self.wav2vec(batch, sr)[0].cpu().detach().numpy()
        elif name == "spec":
            features = self.spec(signal, sr)
        else:
            raise ValueError("Sorry, choose only mfcc, melspec, wav2vec, spec")
        return features

    def __getitem__(self, item):
        """Return ``(window, label)`` for the utterance at position ``item``.

        ``window`` is a random crop of ``sequence_length`` consecutive
        frames; utterances that are too short are wrap-padded first.
        """
        line = self.iemocap_table["wav_path"].iloc[item]
        emotion = self.iemocap_table["emotion"].iloc[item]
        emotion = self.emo_to_int[emotion]

        if self.from_npy is None:
            wav_path = self.root_path + "/" + line
            audio, _ = self.load_wav(wav_path)
            # Transposed so that axis 0 is the frame axis sliced below.
            features = self.extract_features(audio, sr=16000).transpose()
        else:
            features = self.all_data[item]

        self.number_frames = features.shape[0]
        if self.number_frames <= self.sequence_length:
            # Pad to one frame more than the window so the randint range
            # below is never empty.
            features = self.padding(features, seq_length=self.sequence_length + 1)
            self.number_frames = features.shape[0]

        self.current_frame = np.random.randint(0, self.number_frames - self.sequence_length)
        self.out = features[self.current_frame: self.current_frame + self.sequence_length]
        return torch.from_numpy(self.out), torch.tensor(emotion)
......@@ -19,10 +19,11 @@ class NetIemocap(nn.Module):
self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
num_layers=num_layers, batch_first=True) #lstm
self.fc_1 = nn.Linear(hidden_size, 128) #fully connected 1
self.fc = nn.Linear(128, num_classes) #fully connected last layer
self.fc_1 = nn.Linear(hidden_size, 128) # fully connected 1
self.relu = nn.ReLU()
self.fc = nn.Linear(128, num_classes) # fully connected last layer
def forward(self,x):
h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() #hidden state
import torch
import torch.nn as nn
from einops import rearrange
from positional_encodings.torch_encodings import PositionalEncoding1D, Summer
class Query2Label(nn.Module):
    """Modified Query2Label model.

    Unlike the model described in the paper (which uses a modified DETR
    transformer), this version uses a standard, unmodified Pytorch Transformer.
    Learnable label embeddings are passed to the decoder module as the target
    sequence (and ultimately is passed as the Query to MHA).
    """

    def __init__(
        self, num_classes,
        hidden_dim: int = None,
        nheads: int = 8,
        encoder_layers: int = 6,
        decoder_layers: int = 6,
        use_pos_encoding: bool = False,
    ):
        """Initializes model.

        Args:
            num_classes (int): Number of possible label classes.
            hidden_dim (int, optional): Channel size of the input features
                and of the transformer. Defaults to 256 when None.
            nheads (int, optional): Number of MHA heads. Defaults to 8.
            encoder_layers (int, optional): Number of encoders. Defaults to 6.
            decoder_layers (int, optional): Number of decoders. Defaults to 6.
            use_pos_encoding (bool, optional): Flag for use of position encoding.
                Defaults to False.
        """
        # nn.Module must be initialised before any submodule or parameter
        # is assigned to ``self``.
        super().__init__()

        self.num_classes = num_classes
        self.use_pos_encoding = use_pos_encoding
        self.hidden_dim = 256 if hidden_dim is None else hidden_dim

        self.transformer = nn.Transformer(
            self.hidden_dim, nheads, encoder_layers, decoder_layers)

        if self.use_pos_encoding:
            # returns the encoding object
            self.pos_encoder = PositionalEncoding1D(self.hidden_dim)
            # returns the summing object
            self.encoding_adder = Summer(self.pos_encoder)

        # prediction head
        self.classifier = nn.Linear(num_classes * self.hidden_dim, num_classes)

        # learnable label embedding
        self.label_emb = nn.Parameter(torch.rand(1, num_classes, self.hidden_dim))

    def forward(self, features):
        """Passes batch through network.

        Args:
            features (Tensor): Batch of features (melspec, wav2vec ...) laid
                out as [BATCH_SIZE x TIME x hidden_dim].

        Returns:
            Tensor: Output of classification head, [BATCH_SIZE x num_classes].
        """
        # add position encodings
        if self.use_pos_encoding:
            # input with encoding added
            features = self.encoding_adder(features)

        # [BATCH x TIME x CHANNEL] -> [TIME x BATCH x CHANNEL], the
        # sequence-first layout nn.Transformer expects by default.
        features = features.permute(1, 0, 2)
        B = features.shape[1]

        # convert label_emb from [1 x TARGET x EMBED_SIZE] to
        # [TARGET x BATCH_SIZE x EMBED_SIZE] so it can serve as the decoder
        # target sequence.
        label_emb = self.label_emb.repeat(B, 1, 1)
        label_emb = label_emb.transpose(0, 1)
        h = self.transformer(features, label_emb).transpose(0, 1)

        # output from transformer was of dim [TARGET x BATCH_SIZE x EMBED_SIZE];
        # however, we transposed it to [BATCH_SIZE x TARGET x EMBED_SIZE] above.
        # below we reshape to [BATCH_SIZE x TARGET*EMBED_SIZE].
        # next, we project transformer outputs to class labels
        h = torch.reshape(h, (B, self.num_classes * self.hidden_dim))
        return self.classifier(h)
import numpy as np
from tqdm import tqdm
from dataset import IEMOCAPDataset
from dataset import IEMOCAPDataset as IEMOCAPDataset_v1
from dataset_v2 import IEMOCAPDataset as IEMOCAPDataset_v2
def preprocess(version, preprocessed_path, df, features=None, session_to_test=None,
               train=None, root_path=None, wa2v_weights_path=None):
    """Pre-compute the features of one dataset split and save them as ``.npy``.

    Args:
        version: Dataset implementation to use, 1 or 2.
        preprocessed_path: Directory the ``.npy`` file is written to.
        df: Table handed to the dataset as ``data_root``.
        features: Feature name forwarded to the dataset.
        session_to_test: Held-out session forwarded to the dataset.
        train: Split flag forwarded to the dataset.
        root_path: Directory prepended to every ``wav_path`` entry.
        wa2v_weights_path: wav2vec checkpoint forwarded to the dataset.

    Raises:
        ValueError: if ``version`` is neither 1 nor 2.
    """
    if version == 1:
        dataset_cls = IEMOCAPDataset_v1
    elif version == 2:
        dataset_cls = IEMOCAPDataset_v2
    else:
        raise ValueError(f"Unsupported dataset version: {version!r} (expected 1 or 2)")

    x_features_train = dataset_cls(data_root=df,
                                   features_name=features,
                                   session_to_test=session_to_test,
                                   train=train,
                                   root_path=root_path,
                                   wa2v_weights_path=wa2v_weights_path)

    all_data = []
    for line in tqdm(x_features_train.iemocap_table["wav_path"]):
        wav_path = root_path + "/" + line
        audio, sr = x_features_train.load_wav(wav_path)
        data = x_features_train.extract_features(audio, sr).transpose()
        all_data.append(data)

    # Feature matrices may differ in length, so store them in an explicit
    # 1-D object array (one entry per utterance) rather than letting NumPy
    # try to build a rectangular array.
    packed = np.empty(len(all_data), dtype=object)
    for index, data in enumerate(all_data):
        packed[index] = data
    np.save(f"{preprocessed_path}/{features}-session_to_test_{session_to_test}-train_{train}.npy", packed)
\ No newline at end of file