initial commit

George Kasparyants
2024-04-24 06:57:30 +04:00
commit 3633aa99e5
29 changed files with 2555 additions and 0 deletions

emotalk_own/Dockerfile (Normal file, 54 lines)

@@ -0,0 +1,54 @@
FROM nvidia/cudagl:11.3.1-devel-ubuntu20.04
LABEL maintainer="Jungwoo Choi"
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Seoul
ADD requirements.txt /tmp/requirements.txt
RUN \
# Fix CUDA apt error
rm -f /etc/apt/sources.list.d/cuda.list && \
rm -f /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get update && apt-get install -y gnupg2 software-properties-common && \
apt-key del 7fa2af80 && \
apt-get update && apt-get install -y --no-install-recommends wget && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
# Install system packages
apt update && \
add-apt-repository -y ppa:savoury1/ffmpeg4 && \
apt -y install python3.8 python3.8-distutils libgl1-mesa-glx libglib2.0-0 git wget zsh vim openssh-server curl ffmpeg && \
# Python Library
update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
wget https://bootstrap.pypa.io/get-pip.py && \
python3 get-pip.py && \
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113 && \
pip install -r /tmp/requirements.txt && \
# zsh option
chsh -s /bin/zsh && \
sh -c "$(wget -O- https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" && \
# add zsh-autosuggestions, zsh-syntax-highlighting plugin
git clone https://github.com/zsh-users/zsh-autosuggestions ~/.oh-my-zsh/custom/plugins/zsh-autosuggestions && \
git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ~/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting && \
# Modify .zshrc with Perl
perl -pi -w -e 's/ZSH_THEME=.*/ZSH_THEME="af-magic"/g;' ~/.zshrc && \
perl -pi -w -e 's/plugins=.*/plugins=(git ssh-agent zsh-autosuggestions zsh-syntax-highlighting)/g;' ~/.zshrc && \
# Set the ssh id and password; the default is id = root, password = root.
# I recommend changing this for better security.
# PermitRootLogin yes - allow root login over ssh
echo 'root:root' |chpasswd && \
sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config && \
mkdir /root/.ssh && \
mkdir /var/run/sshd && \
# Install the English language pack to avoid locale issues.
apt-get install -y language-pack-en && update-locale && \
# Clean up
apt-get clean && \
apt-get autoclean && \
apt-get autoremove -y && \
rm -rf /var/lib/cache/* && \
rm -rf /var/lib/log/*
WORKDIR /workspace
CMD ["echo", "nvidia/cudagl:11.3.1-devel-ubuntu20.04 is ready!", 'zsh']

emotalk_own/LICENSE (Normal file, 13 lines)

@@ -0,0 +1,13 @@
Copyright (c) 2023 Psyche AI Inc.
This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License (CC BY-NC 4.0). To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, and distribute the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
1. Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
2. NonCommercial — You may not use the material for commercial purposes.
3. No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

emotalk_own/blender.sh (Executable file, 4 lines)

@@ -0,0 +1,4 @@
wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz
tar -xf blender-3.4.1-linux-x64.tar.xz
mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz

emotalk_own/demo.py (Normal file, 111 lines)

@@ -0,0 +1,111 @@
import librosa
import numpy as np
import argparse
from scipy.signal import savgol_filter
import torch
from model import EmoTalk
import random
import os, subprocess
import shlex
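# Pipeline: load a wav file, predict 52 blendshape coefficients with EmoTalk, optionally
# smooth the curves and inject eye blinks, then render frames with Blender and mux the
# original audio back in with ffmpeg.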
@torch.no_grad()
def test(args):
result_path = args.result_path
os.makedirs(result_path, exist_ok=True)
eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503])
eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929])
eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896])
eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493])
model = EmoTalk(args)
model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False)
model = model.to(args.device)
model.eval()
wav_path = args.wav_path
file_name = wav_path.split('/')[-1].split('.')[0]
speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000)
audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device)
level = torch.tensor([1]).to(args.device)
person = torch.tensor([0]).to(args.device)
prediction = model.predict(audio, level, person)
prediction = prediction.squeeze().detach().cpu().numpy()
if args.post_processing:
output = np.zeros((prediction.shape[0], prediction.shape[1]))
for i in range(prediction.shape[1]):
output[:, i] = savgol_filter(prediction[:, i], 5, 2)
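        # Columns 8 and 9 are eyeBlinkLeft / eyeBlinkRight (see model_bsList in render.py):
        # clear the predicted values and insert blink templates at random intervals instead.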
output[:, 8] = 0
output[:, 9] = 0
i = random.randint(0, 60)
while i < output.shape[0] - 7:
eye_num = random.randint(1, 4)
if eye_num == 1:
output[i:i + 7, 8] = eye1
output[i:i + 7, 9] = eye1
elif eye_num == 2:
output[i:i + 7, 8] = eye2
output[i:i + 7, 9] = eye2
elif eye_num == 3:
output[i:i + 7, 8] = eye3
output[i:i + 7, 9] = eye3
else:
output[i:i + 7, 8] = eye4
output[i:i + 7, 9] = eye4
time1 = random.randint(60, 180)
i = i + time1
np.save(os.path.join(result_path, "{}.npy".format(file_name)), output) # with postprocessing (smoothing and blinking)
else:
np.save(os.path.join(result_path, "{}.npy".format(file_name)), prediction) # without post-processing
def render_video(args):
wav_name = args.wav_path.split('/')[-1].split('.')[0]
image_path = os.path.join(args.result_path, wav_name)
os.makedirs(image_path, exist_ok=True)
image_temp = image_path + "/%d.png"
output_path = os.path.join(args.result_path, wav_name + ".mp4")
blender_path = args.blender_path
python_path = "./render.py"
blend_path = "./render.blend"
cmd = '{} -t 64 -b {} -P {} -- "{}" "{}" '.format(blender_path, blend_path, python_path, args.result_path, wav_name)
cmd = shlex.split(cmd)
p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while p.poll() is None:
        line = p.stdout.readline().decode("utf-8", errors="ignore")
        line = line.strip()
if line:
print('[{}]'.format(line))
if p.returncode == 0:
print('Subprogram success')
else:
print('Subprogram failed')
cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 "{}" -y'.format(image_temp, args.wav_path, output_path)
subprocess.call(cmd, shell=True)
cmd = 'rm -rf "{}"'.format(image_path)
subprocess.call(cmd, shell=True)
def main():
parser = argparse.ArgumentParser(
description='EmoTalk: Speech-driven Emotional Disentanglement for 3D Face Animation')
parser.add_argument("--wav_path", type=str, default="./audio/angry1.wav", help='path of the test data')
parser.add_argument("--bs_dim", type=int, default=52, help='number of blendshapes:52')
parser.add_argument("--feature_dim", type=int, default=832, help='number of feature dim')
parser.add_argument("--period", type=int, default=30, help='number of period')
parser.add_argument("--device", type=str, default="cuda", help='device')
parser.add_argument("--model_path", type=str, default="./pretrain_model/EmoTalk.pth",
help='path of the trained models')
parser.add_argument("--result_path", type=str, default="./result/", help='path of the result')
parser.add_argument("--max_seq_len", type=int, default=5000, help='max sequence length')
parser.add_argument("--num_workers", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--post_processing", type=bool, default=True, help='whether to use post processing')
parser.add_argument("--blender_path", type=str, default="./blender/blender", help='path of blender')
args = parser.parse_args()
test(args)
render_video(args)
if __name__ == "__main__":
main()

emotalk_own/model.py (Normal file, 144 lines)

@@ -0,0 +1,144 @@
import torch
import torch.nn as nn
import numpy as np
import math
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor
from wav2vec import Wav2Vec2Model, Wav2Vec2ForSpeechClassification
from utils import init_biased_mask, enc_dec_mask
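# EmoTalk fuses two wav2vec 2.0 streams with frozen feature extractors: a content
# encoder (an English ASR model) and an emotion encoder (a speech-emotion model).
# Learned level/person embeddings are concatenated with the audio features, and a
# biased transformer decoder maps the result to 52 blendshape coefficients per frame.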
class EmoTalk(nn.Module):
def __init__(self, args):
super(EmoTalk, self).__init__()
self.feature_dim = args.feature_dim
self.bs_dim = args.bs_dim
self.device = args.device
self.batch_size = args.batch_size
self.audio_encoder_cont = Wav2Vec2Model.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
self.processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
self.audio_encoder_cont.feature_extractor._freeze_parameters()
self.audio_encoder_emo = Wav2Vec2ForSpeechClassification.from_pretrained(
"r-f/wav2vec-english-speech-emotion-recognition")
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"r-f/wav2vec-english-speech-emotion-recognition")
self.audio_encoder_emo.wav2vec2.feature_extractor._freeze_parameters()
self.max_seq_len = args.max_seq_len
self.audio_feature_map_cont = nn.Linear(1024, 512)
self.audio_feature_map_emo = nn.Linear(1024, 832)
self.audio_feature_map_emo2 = nn.Linear(832, 256)
self.relu = nn.ReLU()
self.biased_mask1 = init_biased_mask(n_head=4, max_seq_len=args.max_seq_len, period=args.period)
self.one_hot_level = np.eye(2)
self.obj_vector_level = nn.Linear(2, 32)
self.one_hot_person = np.eye(24)
self.obj_vector_person = nn.Linear(24, 32)
decoder_layer = nn.TransformerDecoderLayer(d_model=args.feature_dim, nhead=4, dim_feedforward=args.feature_dim,
batch_first=True)
self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=1)
self.bs_map_r = nn.Linear(self.feature_dim, self.bs_dim)
nn.init.constant_(self.bs_map_r.weight, 0)
nn.init.constant_(self.bs_map_r.bias, 0)
def forward(self, data):
frame_num11 = data["target11"].shape[1]
frame_num12 = data["target12"].shape[1]
inputs12 = self.processor(torch.squeeze(data["input12"]), sampling_rate=16000, return_tensors="pt",
padding="longest").input_values.to(self.device)
hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state
hidden_states_cont12 = self.audio_encoder_cont(inputs12, frame_num=frame_num12).last_hidden_state
inputs21 = self.feature_extractor(torch.squeeze(data["input21"]), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
inputs12 = self.feature_extractor(torch.squeeze(data["input12"]), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
output_emo1 = self.audio_encoder_emo(inputs21, frame_num=frame_num11)
output_emo2 = self.audio_encoder_emo(inputs12, frame_num=frame_num12)
hidden_states_emo1 = output_emo1.hidden_states
hidden_states_emo2 = output_emo2.hidden_states
label1 = output_emo1.logits
onehot_level = self.one_hot_level[data["level"]]
onehot_level = torch.from_numpy(onehot_level).to(self.device).float()
onehot_person = self.one_hot_person[data["person"]]
onehot_person = torch.from_numpy(onehot_person).to(self.device).float()
if data["target11"].shape[0] == 1:
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0)
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0)
else:
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2)
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2)
obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1)
obj_embedding_level12 = obj_embedding_level.repeat(1, frame_num12, 1)
obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1)
obj_embedding_person12 = obj_embedding_person.repeat(1, frame_num12, 1)
hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1)
hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1)
hidden_states_emo11_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo11_832))
hidden_states11 = torch.cat(
[hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2)
hidden_states_cont12 = self.audio_feature_map_cont(hidden_states_cont12)
hidden_states_emo12_832 = self.audio_feature_map_emo(hidden_states_emo2)
hidden_states_emo12_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo12_832))
hidden_states12 = torch.cat(
[hidden_states_cont12, hidden_states_emo12_256, obj_embedding_level12, obj_embedding_person12], dim=2)
if data["target11"].shape[0] == 1:
tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1], :hidden_states11.shape[1]].clone().detach().to(
device=self.device)
tgt_mask22 = self.biased_mask1[:, :hidden_states12.shape[1], :hidden_states12.shape[1]].clone().detach().to(
device=self.device)
memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1])
memory_mask12 = enc_dec_mask(self.device, hidden_states12.shape[1], hidden_states12.shape[1])
bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11,
memory_mask=memory_mask11)
bs_out12 = self.transformer_decoder(hidden_states12, hidden_states_emo12_832, tgt_mask=tgt_mask22,
memory_mask=memory_mask12)
bs_output11 = self.bs_map_r(bs_out11)
bs_output12 = self.bs_map_r(bs_out12)
return bs_output11, bs_output12, label1
def predict(self, audio, level, person):
frame_num11 = math.ceil(audio.shape[1] / 16000 * 30)
inputs12 = self.processor(torch.squeeze(audio), sampling_rate=16000, return_tensors="pt",
padding="longest").input_values.to(self.device)
hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state
inputs12 = self.feature_extractor(torch.squeeze(audio), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
output_emo1 = self.audio_encoder_emo(inputs12, frame_num=frame_num11)
hidden_states_emo1 = output_emo1.hidden_states
onehot_level = self.one_hot_level[level]
onehot_level = torch.from_numpy(onehot_level).to(self.device).float()
onehot_person = self.one_hot_person[person]
onehot_person = torch.from_numpy(onehot_person).to(self.device).float()
if audio.shape[0] == 1:
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0)
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0)
else:
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2)
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2)
obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1)
obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1)
hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1)
hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1)
hidden_states_emo11_256 = self.relu(
self.audio_feature_map_emo2(hidden_states_emo11_832))
hidden_states11 = torch.cat(
[hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2)
if audio.shape[0] == 1:
tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1],
:hidden_states11.shape[1]].clone().detach().to(device=self.device)
memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1])
bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11,
memory_mask=memory_mask11)
bs_output11 = self.bs_map_r(bs_out11)
return bs_output11

emotalk_own/readme.md (Normal file, 103 lines)

@@ -0,0 +1,103 @@
![Psyche AI Inc release](./media/psy_logo.png)
# EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation [ICCV2023]
Official PyTorch implementation for the paper:
> **EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation**, ***ICCV 2023***.
>
> Ziqiao Peng, Haoyu Wu, Zhenbo Song, Hao Xu, Xiangyu Zhu, Jun He, Hongyan Liu, Zhaoxin Fan
>
> [arXiv](https://arxiv.org/abs/2303.11089) | [Project Page](https://ziqiaopeng.github.io/emotalk/) | [License](https://github.com/psyai-net/EmoTalk_release/blob/main/LICENSE)
<p align="center">
<img src="./media/emotalk.png" width="90%" />
</p>
> Given audio input expressing different emotions, EmoTalk produces realistic 3D facial animation sequences with corresponding emotional expressions as outputs.
## News
- `2023.10.17` Thanks to [noirmist](https://github.com/noirmist)! Now you can create the environment via Docker; example build and run commands are sketched below.
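A minimal sketch of building and running the provided Dockerfile. The `emotalk` image tag and the mounted path are placeholder choices, and `--gpus all` assumes the NVIDIA Container Toolkit is installed:

```bash
# Build the image from the Dockerfile in this directory (tag name is arbitrary).
docker build -t emotalk .

# Start an interactive shell with GPU access and the repo mounted at /workspace,
# which is the WORKDIR declared in the Dockerfile.
docker run --gpus all -it -v "$(pwd)":/workspace emotalk zsh
```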
## Environment
- Linux
- Python 3.8.8
- PyTorch 1.12.1
- CUDA 11.3
- Blender 3.4.1
- ffmpeg 4.4.1
Clone the repo:
```bash
git clone https://github.com/psyai-net/EmoTalk_release.git
cd EmoTalk_release
```
Create conda environment:
```bash
conda create -n emotalk python=3.8.8
conda activate emotalk
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
pip install -r requirements.txt
```
## **Demo**
Download Blender and put it in this directory.
```bash
wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz
tar -xf blender-3.4.1-linux-x64.tar.xz
mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz
```
Download the pretrained model from [EmoTalk.pth](https://drive.google.com/file/d/1KQZ-WGI9VDFLqgNXvJQosKVCbjTaCPqK/view?usp=drive_link) (updated) and put it under the `pretrain_model` folder.
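A command-line download sketch, assuming the optional third-party `gdown` package (not listed in `requirements.txt`) is acceptable:

```bash
pip install gdown
mkdir -p pretrain_model
# The file ID is taken from the Google Drive link above.
gdown 1KQZ-WGI9VDFLqgNXvJQosKVCbjTaCPqK -O pretrain_model/EmoTalk.pth
```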
Put the audio under the `audio` folder and run
```bash
python demo.py --wav_path "./audio/disgust.wav"
```
The generated animation will be saved in the `result` folder.
## **Dataset**
If you would like to download the 3D-ETF dataset, please fill in the [agreement](https://drive.google.com/file/d/1AQ5_focSgw9WiJdA2R44BQOrdTUe2ABd/view?usp=drive_link), then use an educational email address to email Ziqiao Peng (pengziqiao@ruc.edu.cn) and cc Zhaoxin Fan (fanzhaoxin@psyai.net) to request the download link.
## **Citation**
If you find this work useful for your research, please cite our paper:
```
@InProceedings{Peng_2023_ICCV,
author = {Peng, Ziqiao and Wu, Haoyu and Song, Zhenbo and Xu, Hao and Zhu, Xiangyu and He, Jun and Liu, Hongyan and Fan, Zhaoxin},
title = {EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
month = {October},
year = {2023},
pages = {20687-20697}
}
```
## **Acknowledgement**
Here are some great resources we benefit from:
- [Faceformer](https://github.com/EvelynFan/FaceFormer) for training pipeline
- [EVP](https://github.com/jixinya/EVP) for training dataloader
- [Speech-driven-expressions](https://github.com/YoungSeng/Speech-driven-expressions) for rendering
- [Wav2Vec2 Content](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english) and [Wav2Vec2 Emotion](https://huggingface.co/r-f/wav2vec-english-speech-emotion-recognition) for audio encoder
- [Head Template](http://filmicworlds.com/blog/solving-face-scans-for-arkit/) for visualization.
Thanks to John Hable for sharing his head template under the CC0 license, which was very helpful for visualizing our results.
## **Contact**
For research purposes, such as comparing experimental results, please contact pengziqiao@ruc.edu.cn.
For commercial licensing, please contact fanzhaoxin@psyai.net.
## **License**
This project is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. Please read the [LICENSE](LICENSE) file for more information.
## **Invitation**
We invite you to join [Psyche AI Inc](https://www.psyai.com/home) to conduct cutting-edge research and business implementation together. At Psyche AI Inc, we are committed to pushing the boundaries of what's possible in the fields of artificial intelligence and computer vision, especially their applications in avatars. As a member of our team, you will have the opportunity to collaborate with talented individuals, innovate new ideas, and contribute to projects that have a real-world impact.
If you are passionate about working at the forefront of technology and making a difference, we would love to hear from you. Please visit our website at [Psyche AI Inc](https://www.psyai.com/home) to learn more about us and to apply for open positions. You can also contact us at fanzhaoxin@psyai.net.
Let's shape the future together!!

emotalk_own/render.py (Normal file, 87 lines)

@@ -0,0 +1,87 @@
import bpy
import os
import numpy as np
import sys
filename = str(sys.argv[-1])
root_dir = str(sys.argv[-2])
model_bsList = ["browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"eyeBlinkLeft",
"eyeBlinkRight",
"eyeLookDownLeft",
"eyeLookDownRight",
"eyeLookInLeft",
"eyeLookInRight",
"eyeLookOutLeft",
"eyeLookOutRight",
"eyeLookUpLeft",
"eyeLookUpRight",
"eyeSquintLeft",
"eyeSquintRight",
"eyeWideLeft",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawOpen",
"jawRight",
"mouthClose",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthFunnel",
"mouthLeft",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthPressLeft",
"mouthPressRight",
"mouthPucker",
"mouthRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"]
obj = bpy.data.objects["face"]
bpy.context.scene.render.engine = 'BLENDER_WORKBENCH'
bpy.context.scene.display.shading.light = 'MATCAP'
bpy.context.scene.display.render_aa = 'FXAA'
bpy.context.scene.render.resolution_x = int(512)
bpy.context.scene.render.resolution_y = int(768)
bpy.context.scene.render.fps = 30
bpy.context.scene.render.image_settings.file_format = 'PNG'
cam = bpy.data.objects['Camera']
cam.scale = [2, 2, 2]
bpy.context.scene.camera = cam
output_dir = root_dir + filename
blendshape_path = root_dir + filename + '.npy'
result = []
bs = np.load(blendshape_path)
for i in range(bs.shape[0]):
curr_bs = bs[i]
for j in range(52):
obj.data.shape_keys.key_blocks[model_bsList[j]].value = curr_bs[j]
bpy.context.scene.render.filepath = os.path.join(output_dir, '{}.png'.format(i))
bpy.ops.render.render(write_still=True)

emotalk_own/requirements.txt (Normal file, 5 lines)

@@ -0,0 +1,5 @@
numpy~=1.21.6
transformers~=4.26.0
tqdm~=4.64.1
librosa~=0.10.0
scipy~=1.9.1

emotalk_own/utils.py (Normal file, 39 lines)

@@ -0,0 +1,39 @@
# Borrowed from https://github.com/EvelynFan/FaceFormer/blob/main/faceformer.py
import torch
import math
# Temporal Bias
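# Builds an ALiBi-style biased causal mask: each attention head gets a geometric slope,
# and a period-quantized distance penalty is added on top of an upper-triangular causal
# mask, so attention to frames further in the past is increasingly down-weighted.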
def init_biased_mask(n_head, max_seq_len, period):
def get_slopes(n):
def get_slopes_power_of_2(n):
start = (2 ** (-2 ** -(math.log2(n) - 3)))
ratio = start
return [start * ratio ** i for i in range(n)]
if math.log2(n).is_integer():
return get_slopes_power_of_2(n)
else:
closest_power_of_2 = 2 ** math.floor(math.log2(n))
return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][
:n - closest_power_of_2]
slopes = torch.Tensor(get_slopes(n_head))
bias = torch.arange(start=0, end=max_seq_len, step=period).unsqueeze(1).repeat(1, period).view(-1) // (period)
bias = - torch.flip(bias, dims=[0])
alibi = torch.zeros(max_seq_len, max_seq_len)
for i in range(max_seq_len):
alibi[i, :i + 1] = bias[-(i + 1):]
alibi = slopes.unsqueeze(1).unsqueeze(1) * alibi.unsqueeze(0)
mask = (torch.triu(torch.ones(max_seq_len, max_seq_len)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
mask = mask.unsqueeze(0) + alibi
return mask
# Alignment Bias
def enc_dec_mask(device, T, S):
mask = torch.ones(T, S).to(device)
for i in range(T):
mask[i, i] = 0
return (mask == 1).to(device=device)

emotalk_own/wav2vec.py (Executable file, 245 lines)

@@ -0,0 +1,245 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from dataclasses import dataclass
from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel
from transformers.modeling_outputs import BaseModelOutput
from typing import Optional, Tuple
from transformers.file_utils import ModelOutput
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
_CONFIG_FOR_DOC = "Wav2Vec2Config"
_HIDDEN_STATES_START_POSITION = 2
# the implementation of Wav2Vec2Model is borrowed from https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model
# initialize our encoder with the pre-trained wav2vec 2.0 weights.
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.Tensor] = None,
min_masks: int = 0,
) -> np.ndarray:
bsz, all_sz = shape
mask = np.full((bsz, all_sz), False)
all_num_mask = int(
mask_prob * all_sz / float(mask_length)
+ np.random.rand()
)
all_num_mask = max(min_masks, all_num_mask)
mask_idcs = []
padding_mask = attention_mask.ne(1) if attention_mask is not None else None
for i in range(bsz):
if padding_mask is not None:
sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int(
mask_prob * sz / float(mask_length)
+ np.random.rand()
)
num_mask = max(min_masks, num_mask)
else:
sz = all_sz
num_mask = all_num_mask
lengths = np.full(num_mask, mask_length)
if sum(lengths) == 0:
lengths[0] = min(mask_length, sz - 1)
min_len = min(lengths)
if sz - min_len <= num_mask:
min_len = sz - num_mask - 1
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
min_len = min([len(m) for m in mask_idcs])
for i, mask_idc in enumerate(mask_idcs):
if len(mask_idc) > min_len:
mask_idc = np.random.choice(mask_idc, min_len, replace=False)
mask[i, mask_idc] = True
return mask
# linear interpolation layer
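# Resamples encoder features from the wav2vec 2.0 rate (roughly 50 feature frames per
# second of audio) to the 30 fps animation rate expected downstream, via 1-D linear interpolation.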
def linear_interpolation(features, input_fps, output_fps, output_len=None):
features = features.transpose(1, 2)
seq_len = features.shape[2] / float(input_fps)
if output_len is None:
output_len = int(seq_len * output_fps)
output_features = F.interpolate(features, size=output_len, align_corners=True, mode='linear')
return output_features.transpose(1, 2)
class Wav2Vec2Model(Wav2Vec2Model):
def __init__(self, config):
super().__init__(config)
self.lm_head = nn.Linear(1024, 32)
def forward(
self,
input_values,
attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
frame_num=None
):
self.config.output_attentions = True
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
hidden_states = self.feature_extractor(input_values)
hidden_states = hidden_states.transpose(1, 2)
hidden_states = linear_interpolation(hidden_states, 50, 30, output_len=frame_num)
if attention_mask is not None:
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
attention_mask = torch.zeros(
hidden_states.shape[:2], dtype=hidden_states.dtype, device=hidden_states.device
)
attention_mask[
(torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1)
] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
hidden_states = self.feature_projection(hidden_states)[0]
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
if not return_dict:
return (hidden_states,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@dataclass
class SpeechClassifierOutput(ModelOutput):
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class Wav2Vec2ClassificationHead(nn.Module):
"""Head for wav2vec classification task."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.final_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.pooling_mode = config.pooling_mode
self.config = config
self.wav2vec2 = Wav2Vec2Model(config)
self.classifier = Wav2Vec2ClassificationHead(config)
self.init_weights()
def freeze_feature_extractor(self):
self.wav2vec2.feature_extractor._freeze_parameters()
def merged_strategy(
self,
hidden_states,
mode="mean"
):
if mode == "mean":
outputs = torch.mean(hidden_states, dim=1)
elif mode == "sum":
outputs = torch.sum(hidden_states, dim=1)
elif mode == "max":
outputs = torch.max(hidden_states, dim=1)[0]
else:
raise Exception(
"The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
return outputs
def forward(
self,
input_values,
attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
frame_num=None,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.wav2vec2(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states1 = linear_interpolation(hidden_states, 50, 30, output_len=frame_num)
hidden_states = self.merged_strategy(hidden_states1, mode=self.pooling_mode)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SpeechClassifierOutput(
loss=loss,
logits=logits,
hidden_states=hidden_states1,
attentions=outputs.attentions,
)