# Deciphering non-verbal behaviours
### Jupyter notebooks:
- [Version 0](https://colab.research.google.com/drive/1Ct4q2mRVBq388Duier8hc3nGZm5XT5ZG?usp=sharing)
- [Version 1](https://colab.research.google.com/drive/1oqIoh73WEgPQ0vRVj955mKgK_DJimfjI#scrollTo=0JYi2Ca0hbHr)
- [Version 2](https://colab.research.google.com/drive/13LEnSzpID61hKMQHNyKXlRLeMdSUBKJm?usp=sharing#scrollTo=uzvBW6sLYXWt)
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset
import fairseq
class IEMOCAPDataset(Dataset):
def __init__(self, data_root: str,
def __init__(self,
data_root: str,
train: bool = True,
sequence_length: int = 100,
features_name: str = "spec",
session_to_test: int = 5,
from_npy: str = None
from_npy: str = None,
root_path:str = None,
wa2v_weights_path:str = None
):
super().__init__()
if train:
self.iemocap_table = data_root.query(f'session!={session_to_test}')
@@ -22,17 +28,27 @@ class IEMOCAPDataset(Dataset):
self.sequence_length = sequence_length
self.features_name = features_name
self.from_npy = from_npy
if self.from_npy is not None:
self.all_data = np.load(self.from_npy, allow_pickle=True)
else:
self.root_path = root_path
self.emo_to_int = dict(hap= 0, ang= 1, neu= 2, sad= 3, exc= 0)
# WAV2VEC
if features_name == "wav2vec" and from_npy is None:
cp_path = wa2v_weights_path
self.model_wav2vec, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path])
self.model_wav2vec = self.model_wav2vec[0]
self.model_wav2vec.eval()
def __len__(self):
return len(self.table)
@staticmethod
def load_wav(path: str):
""" Load audio """
path = "/content/gdrive/MyDrive/IEMOCAP_full_release_withoutVideos_sentenceOnly/" + path
signal, sr = librosa.load(path)
return signal, sr
@@ -56,14 +72,18 @@ class IEMOCAPDataset(Dataset):
def spec(self, signal, sample_rate):
X = librosa.stft(signal,
n_fft=1024,
center=False,
hop_length=256,
win_length = 1024)
X= np.abs(X)**2
return X
def wav2vec(self, signal, sample_rate):
wav2vec = self.model_wav2vec.feature_extractor(signal)
return wav2vec
@staticmethod
def padding(data, seq_length=50):
@@ -83,7 +103,7 @@ class IEMOCAPDataset(Dataset):
elif self.features_name.lower() == "melspec": # reprsentation condensee du spectogram
features = self.melspec(signal, sr)
elif self.features_name.lower() == "wav2vec":
features = self.wav2vec(signal, parameters)
features = self.wav2vec(torch.from_numpy(signal)[None],sr)[0].cpu().detach().numpy()
elif self.features_name.lower() == "spec":
features = self.spec(signal, sr)
else:
@@ -99,7 +119,8 @@ class IEMOCAPDataset(Dataset):
emotion = self.emo_to_int[emotion]
if self.from_npy is None:
audio, sr = self.load_wav(line)
wav_path = self.root_path + "/" + line
audio, sr = self.load_wav(wav_path)
features = self.extract_features(audio, sr).transpose()
else:
features = self.all_data[item]
......
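The dataset changes above route the wav2vec checkpoint through the new wa2v_weights_path argument and build wav paths from root_path. For orientation, a minimal sketch of the fairseq feature-extraction call the class relies on; the checkpoint filename, the dummy waveform, and the exact output shape are assumptions, not project settings.

```python
import torch
import fairseq

cp_path = "wav2vec_large.pt"  # placeholder for wa2v_weights_path
models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path])
model_wav2vec = models[0]
model_wav2vec.eval()

wav = torch.randn(1, 16000)  # one second of dummy 16 kHz audio, batch dimension first
with torch.no_grad():
    feats = model_wav2vec.feature_extractor(wav)  # frame-level features, (batch, channels, frames)
print(feats.shape)
```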
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset
import fairseq
class IEMOCAPDataset(Dataset):
def __init__(self,
data_root,  # pandas DataFrame indexing the IEMOCAP utterances ("wav_path", "emotion", "session" columns)
train: bool = True,
sequence_length: int = 100,
features_name: str = "spec",
session_to_test: int = 5,
from_npy: str = None,
root_path:str = None,
wa2v_weights_path:str = None
):
super().__init__()
if train:
self.iemocap_table = data_root.query(f'session!={session_to_test}')
else:
self.iemocap_table = data_root.query(f'session=={session_to_test}')
print(self.iemocap_table)
self.table = self.iemocap_table
self.train = train
self.sequence_length = sequence_length
self.features_name = features_name
self.from_npy = from_npy
if self.from_npy is not None:
self.all_data = np.load(self.from_npy, allow_pickle=True)
else:
self.root_path = root_path
self.emo_to_int = dict(hap= 0, ang= 1, neu= 2, sad= 3, exc= 0)
# WAV2VEC
if features_name == "wav2vec" and from_npy is None:
cp_path = wa2v_weights_path
self.model_wav2vec, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path])
self.model_wav2vec = self.model_wav2vec[0]
self.model_wav2vec.eval()
def __len__(self):
return len(self.table)
@staticmethod
def load_wav(path: str):
""" Load audio """
signal, sr = librosa.load(path, sr=16000)  # IEMOCAP audio is 16 kHz; avoid librosa's default resampling to 22 050 Hz
return signal, sr
def melspec(self, signal, sample_rate):
mel = librosa.feature.melspectrogram(y=signal,
sr=sample_rate,
n_mels = 80,
hop_length=512,
win_length = 1024)
return mel
def mfcc(self, signal, sample_rate):
mfcc = librosa.feature.mfcc(y=signal,
sr=sample_rate,
n_mfcc=15,
n_fft=1024,
hop_length=256,
win_length = 1024)
return mfcc
def spec(self, signal, sample_rate):
X = librosa.stft(signal,
n_fft=1024,
center=False,
hop_length=256,
win_length = 1024)
X= np.abs(X)**2
return X
def wav2vec(self, signal, sample_rate):
wav2vec = self.model_wav2vec.feature_extractor(signal)
return wav2vec
@staticmethod
def padding(data, seq_length=50):
"""
:param seq_length:
:param data:
:return:
"""
if len(data.shape) == 2:
data = np.pad(data, ((0, seq_length - data.shape[0]), (0, 0)), 'wrap')
return data
def extract_features(self, signal, sr):
if self.features_name.lower() == "mfcc": # 15/16 a la place de 80, ça va etre le pire parce que il su pprime le pitch et les emotions sont liées au pitch.
features = self.mfcc(signal, sr)
elif self.features_name.lower() == "melspec": # reprsentation condensee du spectogram
features = self.melspec(signal, sr)
elif self.features_name.lower() == "wav2vec":
features = self.wav2vec(torch.from_numpy(signal)[None],sr)[0].cpu().detach().numpy()
elif self.features_name.lower() == "spec":
features = self.spec(signal, sr)
else:
raise Exception("Sorry, choose only mfcc, melspec, wav2vec, spec")
return features
def __getitem__(self, item):
while True:
line = self.iemocap_table["wav_path"].iloc[item]
emotion = self.iemocap_table["emotion"].iloc[item]
emotion = self.emo_to_int[emotion]
if self.from_npy is None:
wav_path = self.root_path + "/" + line
audio, sr = self.load_wav(wav_path)
features = self.extract_features(audio, sr=16000).transpose()
else:
features = self.all_data[item]
self.number_frames = features.shape[0]
if self.number_frames > self.sequence_length:
break
else:
features = self.padding(features, seq_length=self.sequence_length+1)
self.number_frames = features.shape[0]
break
# crop a random window of sequence_length consecutive frames
self.current_frame = np.random.randint(0, self.number_frames - self.sequence_length)
self.out = features[self.current_frame: self.current_frame + self.sequence_length]
return torch.from_numpy(self.out), torch.tensor(emotion)
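For orientation, a minimal usage sketch of this dataset. It assumes the IEMOCAP index is a pandas DataFrame with "wav_path", "emotion" and "session" columns; the file names and paths are placeholders, not values from the repository.

```python
import pandas as pd
from torch.utils.data import DataLoader

iemocap_df = pd.read_csv("iemocap_index.csv")  # hypothetical index file

train_set = IEMOCAPDataset(
    data_root=iemocap_df,
    train=True,                              # sessions other than session_to_test
    sequence_length=100,
    features_name="melspec",
    session_to_test=5,                       # session held out for testing
    root_path="/data/IEMOCAP_full_release",  # placeholder path to the wav files
)

loader = DataLoader(train_set, batch_size=32, shuffle=True)
features, emotion = next(iter(loader))
# features: (32, 100, 80) for 80-band mel spectrograms, emotion: (32,)
```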
@@ -19,10 +19,11 @@ class NetIemocap(nn.Module):
self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
num_layers=num_layers, batch_first=True) #lstm
self.fc_1 = nn.Linear(hidden_size, 128) #fully connected 1
self.fc = nn.Linear(128, num_classes) #fully connected last layer
self.fc_1 = nn.Linear(hidden_size, 128) # fully connected 1
self.relu = nn.ReLU()
self.fc = nn.Linear(128, num_classes) # fully connected last layer
def forward(self,x):
h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() #hidden state
......
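The compare view cuts the NetIemocap forward pass off after the hidden-state initialisation. As a reading aid only, here is a generic sketch of how an LSTM classifier with this layer layout is commonly wired; the class name, default sizes, and the use of the final hidden state are assumptions, not the project's code.

```python
import torch
import torch.nn as nn

class LSTMEmotionClassifier(nn.Module):
    """Hypothetical stand-in mirroring the layers above: LSTM -> fc_1 -> ReLU -> fc."""

    def __init__(self, input_size, hidden_size=256, num_layers=1, num_classes=4):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc_1 = nn.Linear(hidden_size, 128)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        # x: (batch, time, input_size); zero initial states are PyTorch's default
        output, (h_n, c_n) = self.lstm(x)
        # use the last layer's final hidden state as the utterance embedding
        h = self.relu(self.fc_1(h_n[-1]))
        return self.fc(h)
```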
import torch
import torch.nn as nn
from einops import rearrange
from positional_encodings.torch_encodings import PositionalEncoding1D, Summer
class Query2Label(nn.Module):
"""Modified Query2Label model
Unlike the model described in the paper (which uses a modified DETR
transformer), this version uses a standard, unmodified Pytorch Transformer.
Learnable label embeddings are passed to the decoder module as the target
sequence (and ultimately is passed as the Query to MHA).
"""
def __init__(
self, num_classes,
nheads=8,
hidden_dim: int = None,
encoder_layers=1,
decoder_layers=1,
use_pos_encoding=True):
"""Initializes model
Args:
model (str): Timm model descriptor for backbone.
conv_out (int): Backbone output channels.
num_classes (int): Number of possible label classes
hidden_dim (int, optional): Hidden channels from linear projection of
backbone output. Defaults to 256.
nheads (int, optional): Number of MHA heads. Defaults to 8.
encoder_layers (int, optional): Number of encoders. Defaults to 6.
decoder_layers (int, optional): Number of decoders. Defaults to 6.
use_pos_encoding (bool, optional): Flag for use of position encoding.
Defaults to False.
"""
super().__init__()
self.num_classes = num_classes
self.use_pos_encoding = use_pos_encoding
self.hidden_dim = hidden_dim
self.transformer = nn.Transformer(self.hidden_dim, nheads, encoder_layers, decoder_layers)
if self.use_pos_encoding:
# returns the encoding object
self.pos_encoder = PositionalEncoding1D(self.hidden_dim)
# returns the summing object
self.encoding_adder = Summer(self.pos_encoder)
# prediction head
self.classifier = nn.Linear(num_classes * self.hidden_dim, num_classes)
# learnable label embedding
self.label_emb = nn.Parameter(torch.rand(1, num_classes, self.hidden_dim))
def forward(self, features):
"""Passes batch through network
Args:
features (Tensor): Batch of feature sequences (melspec, wav2vec, ...) of shape [BATCH_SIZE x TIME x hidden_dim]
Returns:
Tensor: Output of classification head
"""
# add position encodings
if self.use_pos_encoding:
# input with encoding added
features = self.encoding_adder(features)
features = rearrange(features, 'b t c -> t b c')
B = features.shape[1]
# image feature vector "h" is sent in after transformation above; we
# also convert label_emb from [1 x TARGET x (hidden)EMBED_SIZE] to
# [TARGET x BATCH_SIZE x (hidden)EMBED_SIZE]
label_emb = self.label_emb.repeat(B, 1, 1)
label_emb = label_emb.transpose(0, 1)
h = self.transformer(features, label_emb).transpose(0, 1)
# output from transformer was of dim [TARGET x BATCH_SIZE x EMBED_SIZE];
# however, we transposed it to [BATCH_SIZE x TARGET x EMBED_SIZE] above.
# below we reshape to [BATCH_SIZE x TARGET*EMBED_SIZE].
#
# next, we project transformer outputs to class labels
h = torch.reshape(h, (B, self.num_classes * self.hidden_dim))
return self.classifier(h)
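A minimal sketch of driving this head with spectrogram-like features: hidden_dim must equal the per-frame feature dimension, and the batch size, sequence length, and feature size below are illustrative assumptions rather than project settings.

```python
import torch

num_classes, hidden_dim = 4, 80  # e.g. 80 mel bands per frame; must be divisible by nheads
model = Query2Label(num_classes=num_classes,
                    nheads=8,
                    hidden_dim=hidden_dim,
                    encoder_layers=1,
                    decoder_layers=1,
                    use_pos_encoding=True)

features = torch.rand(32, 100, hidden_dim)  # (batch, time, feature_dim)
logits = model(features)                    # (32, num_classes)
```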
import numpy as np
from tqdm import tqdm
from dataset import IEMOCAPDataset
from dataset import IEMOCAPDataset as IEMOCAPDataset_v1
from dataset_v2 import IEMOCAPDataset as IEMOCAPDataset_v2
def preprocess(preprocessed_path, df, features = None, session_to_test= None, train= None):
x_features_train = IEMOCAPDataset(data_root=df, features_name=features, session_to_test=session_to_test, train=train)
def preprocess(version, preprocessed_path, df, features = None, session_to_test= None, train= None, root_path=None, wa2v_weights_path=None):
if version == 1:
x_features_train = IEMOCAPDataset_v1(data_root=df, features_name=features, session_to_test=session_to_test, train=train, root_path=root_path, wa2v_weights_path=wa2v_weights_path)
elif version == 2:
x_features_train = IEMOCAPDataset_v2(data_root=df, features_name=features, session_to_test=session_to_test, train=train, root_path=root_path, wa2v_weights_path=wa2v_weights_path)
n_items = len(x_features_train)
all_data = []
for i in tqdm(range(n_items)):
line = x_features_train.iemocap_table["wav_path"].iloc[i]
audio, sr = x_features_train.load_wav(line)
wav_path = root_path + "/" + line
audio, sr = x_features_train.load_wav(wav_path)
data = x_features_train.extract_features(audio, sr).transpose()
all_data.append(data)
np.save(f"{preprocessed_path}/{features}-session_to_test_{session_to_test}-train_{train}.npy", np.array(all_data))
\ No newline at end of file
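To make the cached-feature workflow concrete, here is a hypothetical driver for the updated preprocess signature; the index DataFrame, output directory, and checkpoint path are placeholders, not values from the repository.

```python
import pandas as pd

iemocap_df = pd.read_csv("iemocap_index.csv")  # hypothetical index file

for train_flag in (True, False):
    preprocess(version=2,
               preprocessed_path="preprocessed",
               df=iemocap_df,
               features="wav2vec",
               session_to_test=5,
               train=train_flag,
               root_path="/data/IEMOCAP_full_release",  # placeholder path to the wav files
               wa2v_weights_path="wav2vec_large.pt")    # placeholder fairseq checkpoint
```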