# Deciphering non-verbal behaviours
### Jupyter notebooks:
- [Version 0](https://colab.research.google.com/drive/1Ct4q2mRVBq388Duier8hc3nGZm5XT5ZG?usp=sharing)
- [Version 1](https://colab.research.google.com/drive/1oqIoh73WEgPQ0vRVj955mKgK_DJimfjI#scrollTo=0JYi2Ca0hbHr)
- [Version 2](https://colab.research.google.com/drive/13LEnSzpID61hKMQHNyKXlRLeMdSUBKJm?usp=sharing#scrollTo=uzvBW6sLYXWt)
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset
import fairseq
class IEMOCAPDataset(Dataset):
def __init__(self, data_root: str,
def __init__(self,
data_root: str,
train: bool = True,
sequence_length: int = 100,
features_name: str = "spec",
session_to_test: int = 5,
from_npy: str = None
from_npy: str = None,
root_path:str = None,
wa2v_weights_path:str = None
):
super().__init__()
if train:
self.iemocap_table = data_root.query(f'session!={session_to_test}')
@@ -22,17 +28,27 @@ class IEMOCAPDataset(Dataset):
self.sequence_length = sequence_length
self.features_name = features_name
self.from_npy = from_npy
if self.from_npy is not None:
self.all_data = np.load(self.from_npy, allow_pickle=True)
else:
self.root_path = root_path
self.emo_to_int = dict(hap= 0, ang= 1, neu= 2, sad= 3, exc= 0)
# WAV2VEC
if features_name == "wav2vec" and from_npy is None:
cp_path = wa2v_weights_path
self.model_wav2vec, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path])
self.model_wav2vec = self.model_wav2vec[0]
self.model_wav2vec.eval()
def __len__(self):
return len(self.table)
@staticmethod
def load_wav(path: str):
""" Load audio """
path = "/content/gdrive/MyDrive/IEMOCAP_full_release_withoutVideos_sentenceOnly/" + path
signal, sr = librosa.load(path)
return signal, sr
@@ -56,14 +72,18 @@ class IEMOCAPDataset(Dataset):
def spec(self, signal, sample_rate):
X = librosa.stft(signal,
n_fft=1024,
center=False,
hop_length=256,
win_length = 1024)
X= np.abs(X)**2
return X
def wav2vec(self, signal, sample_rate):
wav2vec = self.model_wav2vec.feature_extractor(signal)
return wav2vec
@staticmethod
def padding(data, seq_length=50):
@@ -83,7 +103,7 @@ class IEMOCAPDataset(Dataset):
elif self.features_name.lower() == "melspec": # reprsentation condensee du spectogram
features = self.melspec(signal, sr)
elif self.features_name.lower() == "wav2vec":
features = self.wav2vec(signal, parameters)
features = self.wav2vec(torch.from_numpy(signal)[None],sr)[0].cpu().detach().numpy()
elif self.features_name.lower() == "spec":
features = self.spec(signal, sr)
else:
@@ -99,7 +119,8 @@ class IEMOCAPDataset(Dataset):
emotion = self.emo_to_int[emotion]
if self.from_npy is None:
audio, sr = self.load_wav(line)
wav_path = self.root_path + "/" + line
audio, sr = self.load_wav(wav_path)
features = self.extract_features(audio, sr).transpose()
else:
features = self.all_data[item]
......
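The dataset changes above route the wav2vec checkpoint through the new wa2v_weights_path argument and build wav paths from root_path. For orientation, a minimal sketch of the fairseq feature-extraction call the class relies on; the checkpoint filename, the dummy waveform, and the exact output shape are assumptions, not project settings.

```python
import torch
import fairseq

cp_path = "wav2vec_large.pt"  # placeholder for wa2v_weights_path
models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path])
model_wav2vec = models[0]
model_wav2vec.eval()

wav = torch.randn(1, 16000)  # one second of dummy 16 kHz audio, batch dimension first
with torch.no_grad():
    feats = model_wav2vec.feature_extractor(wav)  # frame-level features, (batch, channels, frames)
print(feats.shape)
```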
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset
import fairseq
class IEMOCAPDataset(Dataset):
def __init__(self,
data_root,  # pandas DataFrame indexing the IEMOCAP utterances ("wav_path", "emotion", "session" columns)
train: bool = True,
sequence_length: int = 100,
features_name: str = "spec",
session_to_test: int = 5,
from_npy: str = None,
root_path:str = None,
wa2v_weights_path:str = None
):
super().__init__()
if train:
self.iemocap_table = data_root.query(f'session!={session_to_test}')
else:
self.iemocap_table = data_root.query(f'session=={session_to_test}')
print(self.iemocap_table)
self.table = self.iemocap_table
self.train = train
self.sequence_length = sequence_length
self.features_name = features_name
self.from_npy = from_npy
if self.from_npy is not None:
self.all_data = np.load(self.from_npy, allow_pickle=True)
else:
self.root_path = root_path
self.emo_to_int = dict(hap= 0, ang= 1, neu= 2, sad= 3, exc= 0)
# WAV2VEC
if features_name == "wav2vec" and from_npy is None:
cp_path = wa2v_weights_path
self.model_wav2vec, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path])
self.model_wav2vec = self.model_wav2vec[0]
self.model_wav2vec.eval()
def __len__(self):
return len(self.table)
@staticmethod
def load_wav(path: str):
""" Load audio """
signal, sr = librosa.load(path, sr=16000)  # IEMOCAP audio is 16 kHz; avoid librosa's default resampling to 22 050 Hz
return signal, sr
def melspec(self, signal, sample_rate):
mel = librosa.feature.melspectrogram(y=signal,
sr=sample_rate,
n_mels = 80,
hop_length=512,
win_length = 1024)
return mel
def mfcc(self, signal, sample_rate):
mfcc = librosa.feature.mfcc(y=signal,
sr=sample_rate,
n_mfcc=15,
n_fft=1024,
hop_length=256,
win_length = 1024)
return mfcc
def spec(self, signal, sample_rate):
X = librosa.stft(signal,
n_fft=1024,
center=False,
hop_length=256,
win_length = 1024)
X= np.abs(X)**2
return X
def wav2vec(self, signal, sample_rate):
wav2vec = self.model_wav2vec.feature_extractor(signal)
return wav2vec
@staticmethod
def padding(data, seq_length=50):
"""
:param seq_length:
:param data:
:return:
"""
if len(data.shape) == 2:
data = np.pad(data, ((0, seq_length - data.shape[0]), (0, 0)), 'wrap')
return data
def extract_features(self, signal, sr):
if self.features_name.lower() == "mfcc": # 15/16 a la place de 80, ça va etre le pire parce que il su pprime le pitch et les emotions sont liées au pitch.
features = self.mfcc(signal, sr)
elif self.features_name.lower() == "melspec": # reprsentation condensee du spectogram
features = self.melspec(signal, sr)
elif self.features_name.lower() == "wav2vec":
features = self.wav2vec(torch.from_numpy(signal)[None],sr)[0].cpu().detach().numpy()
elif self.features_name.lower() == "spec":
features = self.spec(signal, sr)
else:
raise Exception("Sorry, choose only mfcc, melspec, wav2vec, spec")
return features
def __getitem__(self, item):
while True:
line = self.iemocap_table["wav_path"].iloc[item]
emotion = self.iemocap_table["emotion"].iloc[item]
emotion = self.emo_to_int[emotion]
if self.from_npy is None:
wav_path = self.root_path + "/" + line
audio, sr = self.load_wav(wav_path)
features = self.extract_features(audio, sr=16000).transpose()
else:
features = self.all_data[item]
self.number_frames = features.shape[0]
if self.number_frames > self.sequence_length:
break
else:
features = self.padding(features, seq_length=self.sequence_length+1)
self.number_frames = features.shape[0]
break
# crop a random window of sequence_length consecutive frames
self.current_frame = np.random.randint(0, self.number_frames - self.sequence_length)
self.out = features[self.current_frame: self.current_frame + self.sequence_length]
return torch.from_numpy(self.out), torch.tensor(emotion)
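For orientation, a minimal usage sketch of this dataset. It assumes the IEMOCAP index is a pandas DataFrame with "wav_path", "emotion" and "session" columns; the file names and paths are placeholders, not values from the repository.

```python
import pandas as pd
from torch.utils.data import DataLoader

iemocap_df = pd.read_csv("iemocap_index.csv")  # hypothetical index file

train_set = IEMOCAPDataset(
    data_root=iemocap_df,
    train=True,                              # sessions other than session_to_test
    sequence_length=100,
    features_name="melspec",
    session_to_test=5,                       # session held out for testing
    root_path="/data/IEMOCAP_full_release",  # placeholder path to the wav files
)

loader = DataLoader(train_set, batch_size=32, shuffle=True)
features, emotion = next(iter(loader))
# features: (32, 100, 80) for 80-band mel spectrograms, emotion: (32,)
```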
@@ -19,10 +19,11 @@ class NetIemocap(nn.Module):
self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
num_layers=num_layers, batch_first=True) #lstm
self.fc_1 = nn.Linear(hidden_size, 128) #fully connected 1
self.fc = nn.Linear(128, num_classes) #fully connected last layer
self.fc_1 = nn.Linear(hidden_size, 128) # fully connected 1
self.relu = nn.ReLU()
self.fc = nn.Linear(128, num_classes) # fully connected last layer
def forward(self,x):
h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() #hidden state
......
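The compare view cuts the NetIemocap forward pass off after the hidden-state initialisation. As a reading aid only, here is a generic sketch of how an LSTM classifier with this layer layout is commonly wired; the class name, default sizes, and the use of the final hidden state are assumptions, not the project's code.

```python
import torch
import torch.nn as nn

class LSTMEmotionClassifier(nn.Module):
    """Hypothetical stand-in mirroring the layers above: LSTM -> fc_1 -> ReLU -> fc."""

    def __init__(self, input_size, hidden_size=256, num_layers=1, num_classes=4):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc_1 = nn.Linear(hidden_size, 128)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        # x: (batch, time, input_size); zero initial states are PyTorch's default
        output, (h_n, c_n) = self.lstm(x)
        # use the last layer's final hidden state as the utterance embedding
        h = self.relu(self.fc_1(h_n[-1]))
        return self.fc(h)
```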
import torch
import torch.nn as nn
from einops import rearrange
from positional_encodings.torch_encodings import PositionalEncoding1D, Summer
class Query2Label(nn.Module):
"""Modified Query2Label model
Unlike the model described in the paper (which uses a modified DETR
transformer), this version uses a standard, unmodified Pytorch Transformer.
Learnable label embeddings are passed to the decoder module as the target
sequence (and ultimately is passed as the Query to MHA).
"""
def __init__(
self, num_classes,
nheads=8,
hidden_dim: int = None,
encoder_layers=1,
decoder_layers=1,
use_pos_encoding=True):
"""Initializes model
Args:
model (str): Timm model descriptor for backbone.
conv_out (int): Backbone output channels.
num_classes (int): Number of possible label classes
hidden_dim (int, optional): Hidden channels from linear projection of
backbone output. Defaults to 256.
nheads (int, optional): Number of MHA heads. Defaults to 8.
encoder_layers (int, optional): Number of encoders. Defaults to 6.
decoder_layers (int, optional): Number of decoders. Defaults to 6.
use_pos_encoding (bool, optional): Flag for use of position encoding.
Defaults to False.
"""
super().__init__()
self.num_classes = num_classes
self.use_pos_encoding = use_pos_encoding
self.hidden_dim = hidden_dim
self.transformer = nn.Transformer(self.hidden_dim, nheads, encoder_layers, decoder_layers)
if self.use_pos_encoding:
# returns the encoding object
self.pos_encoder = PositionalEncoding1D(self.hidden_dim)
# returns the summing object
self.encoding_adder = Summer(self.pos_encoder)
# prediction head
self.classifier = nn.Linear(num_classes * self.hidden_dim, num_classes)
# learnable label embedding
self.label_emb = nn.Parameter(torch.rand(1, num_classes, self.hidden_dim))
def forward(self, features):
"""Passes batch through network
Args:
features (Tensor): Batch of feature sequences (melspec, wav2vec, ...) of shape [BATCH_SIZE x TIME x hidden_dim]
Returns:
Tensor: Output of classification head
"""
# add position encodings
if self.use_pos_encoding:
# input with encoding added
features = self.encoding_adder(features)
features = rearrange(features, 'b t c -> t b c')
B = features.shape[1]
# image feature vector "h" is sent in after transformation above; we
# also convert label_emb from [1 x TARGET x (hidden)EMBED_SIZE] to
# [TARGET x BATCH_SIZE x (hidden)EMBED_SIZE]
label_emb = self.label_emb.repeat(B, 1, 1)
label_emb = label_emb.transpose(0, 1)
h = self.transformer(features, label_emb).transpose(0, 1)
# output from transformer was of dim [TARGET x BATCH_SIZE x EMBED_SIZE];
# however, we transposed it to [BATCH_SIZE x TARGET x EMBED_SIZE] above.
# below we reshape to [BATCH_SIZE x TARGET*EMBED_SIZE].
#
# next, we project transformer outputs to class labels
h = torch.reshape(h, (B, self.num_classes * self.hidden_dim))
return self.classifier(h)
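A minimal sketch of driving this head with spectrogram-like features: hidden_dim must equal the per-frame feature dimension, and the batch size, sequence length, and feature size below are illustrative assumptions rather than project settings.

```python
import torch

num_classes, hidden_dim = 4, 80  # e.g. 80 mel bands per frame; must be divisible by nheads
model = Query2Label(num_classes=num_classes,
                    nheads=8,
                    hidden_dim=hidden_dim,
                    encoder_layers=1,
                    decoder_layers=1,
                    use_pos_encoding=True)

features = torch.rand(32, 100, hidden_dim)  # (batch, time, feature_dim)
logits = model(features)                    # (32, num_classes)
```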
import numpy as np
from tqdm import tqdm
from dataset import IEMOCAPDataset
from dataset import IEMOCAPDataset as IEMOCAPDataset_v1
from dataset_v2 import IEMOCAPDataset as IEMOCAPDataset_v2
def preprocess(preprocessed_path, df, features = None, session_to_test= None, train= None):
x_features_train = IEMOCAPDataset(data_root=df, features_name=features, session_to_test=session_to_test, train=train)
def preprocess(version, preprocessed_path, df, features = None, session_to_test= None, train= None, root_path=None, wa2v_weights_path=None):
if version == 1:
x_features_train = IEMOCAPDataset_v1(data_root=df, features_name=features, session_to_test=session_to_test, train=train, root_path=root_path, wa2v_weights_path=wa2v_weights_path)
elif version == 2:
x_features_train = IEMOCAPDataset_v2(data_root=df, features_name=features, session_to_test=session_to_test, train=train, root_path=root_path, wa2v_weights_path=wa2v_weights_path)
n_items = len(x_features_train)
all_data = []
for i in tqdm(range(n_items)):
line = x_features_train.iemocap_table["wav_path"].iloc[i]
audio, sr = x_features_train.load_wav(line)
wav_path = root_path + "/" + line
audio, sr = x_features_train.load_wav(wav_path)
data = x_features_train.extract_features(audio, sr).transpose()
all_data.append(data)
np.save(f"{preprocessed_path}/{features}-session_to_test_{session_to_test}-train_{train}.npy", np.array(all_data))
\ No newline at end of file
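To make the cached-feature workflow concrete, here is a hypothetical driver for the updated preprocess signature; the index DataFrame, output directory, and checkpoint path are placeholders, not values from the repository.

```python
import pandas as pd

iemocap_df = pd.read_csv("iemocap_index.csv")  # hypothetical index file

for train_flag in (True, False):
    preprocess(version=2,
               preprocessed_path="preprocessed",
               df=iemocap_df,
               features="wav2vec",
               session_to_test=5,
               train=train_flag,
               root_path="/data/IEMOCAP_full_release",  # placeholder path to the wav files
               wa2v_weights_path="wav2vec_large.pt")    # placeholder fairseq checkpoint
```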