initial commit

George Kasparyants
2024-04-24 06:57:30 +04:00
commit 3633aa99e5
29 changed files with 2555 additions and 0 deletions

emotalk_own/Dockerfile (Normal file, 54 lines)

@@ -0,0 +1,54 @@
FROM nvidia/cudagl:11.3.1-devel-ubuntu20.04
LABEL maintainer="Jungwoo Choi"
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Seoul
ADD requirements.txt /tmp/requirements.txt
RUN \
# Fix CUDA apt error
rm -f /etc/apt/sources.list.d/cuda.list && \
rm -f /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get update && apt-get install -y gnupg2 software-properties-common && \
apt-key del 7fa2af80 && \
apt-get update && apt-get install -y --no-install-recommends wget && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
# Install system packages
apt update && \
add-apt-repository -y ppa:savoury1/ffmpeg4 && \
apt -y install python3.8 python3.8-distutils libgl1-mesa-glx libglib2.0-0 git wget zsh vim openssh-server curl ffmpeg && \
# Python Library
update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
wget https://bootstrap.pypa.io/get-pip.py && \
python3 get-pip.py && \
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113 && \
pip install -r /tmp/requirements.txt && \
# zsh option
chsh -s /bin/zsh && \
sh -c "$(wget -O- https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" && \
# add zsh-autosuggestions, zsh-syntax-highlighting plugin
git clone https://github.com/zsh-users/zsh-autosuggestions ~/.oh-my-zsh/custom/plugins/zsh-autosuggestions && \
git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ~/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting && \
# Modify .zshrc with Perl
perl -pi -w -e 's/ZSH_THEME=.*/ZSH_THEME="af-magic"/g;' ~/.zshrc && \
perl -pi -w -e 's/plugins=.*/plugins=(git ssh-agent zsh-autosuggestions zsh-syntax-highlighting)/g;' ~/.zshrc && \
# Set the ssh id and password; the default is id = root, password = root.
# I recommend changing this for better security.
# PermitRootLogin yes - allow root login over ssh
echo 'root:root' |chpasswd && \
sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config && \
mkdir /root/.ssh && \
mkdir /var/run/sshd && \
# Install the English language pack to avoid locale issues.
apt-get install -y language-pack-en && update-locale && \
# Clean up
apt-get clean && \
apt-get autoclean && \
apt-get autoremove -y && \
rm -rf /var/lib/cache/* && \
rm -rf /var/lib/log/*
WORKDIR /workspace
CMD ["echo", "nvidia/cudagl:11.3.1-devel-ubuntu20.04 is ready!", 'zsh']

emotalk_own/LICENSE (Normal file, 13 lines)

@@ -0,0 +1,13 @@
Copyright (c) 2023 Psyche AI Inc.
This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License (CC BY-NC 4.0). To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, and distribute the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
1. Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
2. NonCommercial — You may not use the material for commercial purposes.
3. No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

emotalk_own/blender.sh (Executable file, 4 lines)

@@ -0,0 +1,4 @@
wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz
tar -xf blender-3.4.1-linux-x64.tar.xz
mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz

emotalk_own/demo.py (Normal file, 111 lines)

@@ -0,0 +1,111 @@
import librosa
import numpy as np
import argparse
from scipy.signal import savgol_filter
import torch
from model import EmoTalk
import random
import os, subprocess
import shlex
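# Pipeline: load a wav file, predict 52 blendshape coefficients with EmoTalk, optionally
# smooth the curves and inject eye blinks, then render frames with Blender and mux the
# original audio back in with ffmpeg.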
@torch.no_grad()
def test(args):
result_path = args.result_path
os.makedirs(result_path, exist_ok=True)
eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503])
eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929])
eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896])
eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493])
model = EmoTalk(args)
model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False)
model = model.to(args.device)
model.eval()
wav_path = args.wav_path
file_name = wav_path.split('/')[-1].split('.')[0]
speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000)
audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device)
level = torch.tensor([1]).to(args.device)
person = torch.tensor([0]).to(args.device)
prediction = model.predict(audio, level, person)
prediction = prediction.squeeze().detach().cpu().numpy()
if args.post_processing:
output = np.zeros((prediction.shape[0], prediction.shape[1]))
for i in range(prediction.shape[1]):
output[:, i] = savgol_filter(prediction[:, i], 5, 2)
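        # Columns 8 and 9 are eyeBlinkLeft / eyeBlinkRight (see model_bsList in render.py):
        # clear the predicted values and insert blink templates at random intervals instead.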
output[:, 8] = 0
output[:, 9] = 0
i = random.randint(0, 60)
while i < output.shape[0] - 7:
eye_num = random.randint(1, 4)
if eye_num == 1:
output[i:i + 7, 8] = eye1
output[i:i + 7, 9] = eye1
elif eye_num == 2:
output[i:i + 7, 8] = eye2
output[i:i + 7, 9] = eye2
elif eye_num == 3:
output[i:i + 7, 8] = eye3
output[i:i + 7, 9] = eye3
else:
output[i:i + 7, 8] = eye4
output[i:i + 7, 9] = eye4
time1 = random.randint(60, 180)
i = i + time1
np.save(os.path.join(result_path, "{}.npy".format(file_name)), output) # with postprocessing (smoothing and blinking)
else:
np.save(os.path.join(result_path, "{}.npy".format(file_name)), prediction) # without post-processing
def render_video(args):
wav_name = args.wav_path.split('/')[-1].split('.')[0]
image_path = os.path.join(args.result_path, wav_name)
os.makedirs(image_path, exist_ok=True)
image_temp = image_path + "/%d.png"
output_path = os.path.join(args.result_path, wav_name + ".mp4")
blender_path = args.blender_path
python_path = "./render.py"
blend_path = "./render.blend"
cmd = '{} -t 64 -b {} -P {} -- "{}" "{}" '.format(blender_path, blend_path, python_path, args.result_path, wav_name)
cmd = shlex.split(cmd)
p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while p.poll() is None:
        line = p.stdout.readline().decode("utf-8", errors="ignore")
        line = line.strip()
if line:
print('[{}]'.format(line))
if p.returncode == 0:
print('Subprogram success')
else:
print('Subprogram failed')
cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 "{}" -y'.format(image_temp, args.wav_path, output_path)
subprocess.call(cmd, shell=True)
cmd = 'rm -rf "{}"'.format(image_path)
subprocess.call(cmd, shell=True)
def main():
parser = argparse.ArgumentParser(
description='EmoTalk: Speech-driven Emotional Disentanglement for 3D Face Animation')
parser.add_argument("--wav_path", type=str, default="./audio/angry1.wav", help='path of the test data')
parser.add_argument("--bs_dim", type=int, default=52, help='number of blendshapes:52')
parser.add_argument("--feature_dim", type=int, default=832, help='number of feature dim')
parser.add_argument("--period", type=int, default=30, help='number of period')
parser.add_argument("--device", type=str, default="cuda", help='device')
parser.add_argument("--model_path", type=str, default="./pretrain_model/EmoTalk.pth",
help='path of the trained models')
parser.add_argument("--result_path", type=str, default="./result/", help='path of the result')
parser.add_argument("--max_seq_len", type=int, default=5000, help='max sequence length')
parser.add_argument("--num_workers", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--post_processing", type=bool, default=True, help='whether to use post processing')
parser.add_argument("--blender_path", type=str, default="./blender/blender", help='path of blender')
args = parser.parse_args()
test(args)
render_video(args)
if __name__ == "__main__":
main()

emotalk_own/model.py (Normal file, 144 lines)

@@ -0,0 +1,144 @@
import torch
import torch.nn as nn
import numpy as np
import math
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor
from wav2vec import Wav2Vec2Model, Wav2Vec2ForSpeechClassification
from utils import init_biased_mask, enc_dec_mask
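# EmoTalk fuses two wav2vec 2.0 streams with frozen feature extractors: a content
# encoder (an English ASR model) and an emotion encoder (a speech-emotion model).
# Learned level/person embeddings are concatenated with the audio features, and a
# biased transformer decoder maps the result to 52 blendshape coefficients per frame.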
class EmoTalk(nn.Module):
def __init__(self, args):
super(EmoTalk, self).__init__()
self.feature_dim = args.feature_dim
self.bs_dim = args.bs_dim
self.device = args.device
self.batch_size = args.batch_size
self.audio_encoder_cont = Wav2Vec2Model.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
self.processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
self.audio_encoder_cont.feature_extractor._freeze_parameters()
self.audio_encoder_emo = Wav2Vec2ForSpeechClassification.from_pretrained(
"r-f/wav2vec-english-speech-emotion-recognition")
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"r-f/wav2vec-english-speech-emotion-recognition")
self.audio_encoder_emo.wav2vec2.feature_extractor._freeze_parameters()
self.max_seq_len = args.max_seq_len
self.audio_feature_map_cont = nn.Linear(1024, 512)
self.audio_feature_map_emo = nn.Linear(1024, 832)
self.audio_feature_map_emo2 = nn.Linear(832, 256)
self.relu = nn.ReLU()
self.biased_mask1 = init_biased_mask(n_head=4, max_seq_len=args.max_seq_len, period=args.period)
self.one_hot_level = np.eye(2)
self.obj_vector_level = nn.Linear(2, 32)
self.one_hot_person = np.eye(24)
self.obj_vector_person = nn.Linear(24, 32)
decoder_layer = nn.TransformerDecoderLayer(d_model=args.feature_dim, nhead=4, dim_feedforward=args.feature_dim,
batch_first=True)
self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=1)
self.bs_map_r = nn.Linear(self.feature_dim, self.bs_dim)
nn.init.constant_(self.bs_map_r.weight, 0)
nn.init.constant_(self.bs_map_r.bias, 0)
def forward(self, data):
frame_num11 = data["target11"].shape[1]
frame_num12 = data["target12"].shape[1]
inputs12 = self.processor(torch.squeeze(data["input12"]), sampling_rate=16000, return_tensors="pt",
padding="longest").input_values.to(self.device)
hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state
hidden_states_cont12 = self.audio_encoder_cont(inputs12, frame_num=frame_num12).last_hidden_state
inputs21 = self.feature_extractor(torch.squeeze(data["input21"]), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
inputs12 = self.feature_extractor(torch.squeeze(data["input12"]), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
output_emo1 = self.audio_encoder_emo(inputs21, frame_num=frame_num11)
output_emo2 = self.audio_encoder_emo(inputs12, frame_num=frame_num12)
hidden_states_emo1 = output_emo1.hidden_states
hidden_states_emo2 = output_emo2.hidden_states
label1 = output_emo1.logits
onehot_level = self.one_hot_level[data["level"]]
onehot_level = torch.from_numpy(onehot_level).to(self.device).float()
onehot_person = self.one_hot_person[data["person"]]
onehot_person = torch.from_numpy(onehot_person).to(self.device).float()
if data["target11"].shape[0] == 1:
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0)
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0)
else:
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2)
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2)
obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1)
obj_embedding_level12 = obj_embedding_level.repeat(1, frame_num12, 1)
obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1)
obj_embedding_person12 = obj_embedding_person.repeat(1, frame_num12, 1)
hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1)
hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1)
hidden_states_emo11_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo11_832))
hidden_states11 = torch.cat(
[hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2)
hidden_states_cont12 = self.audio_feature_map_cont(hidden_states_cont12)
hidden_states_emo12_832 = self.audio_feature_map_emo(hidden_states_emo2)
hidden_states_emo12_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo12_832))
hidden_states12 = torch.cat(
[hidden_states_cont12, hidden_states_emo12_256, obj_embedding_level12, obj_embedding_person12], dim=2)
if data["target11"].shape[0] == 1:
tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1], :hidden_states11.shape[1]].clone().detach().to(
device=self.device)
tgt_mask22 = self.biased_mask1[:, :hidden_states12.shape[1], :hidden_states12.shape[1]].clone().detach().to(
device=self.device)
memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1])
memory_mask12 = enc_dec_mask(self.device, hidden_states12.shape[1], hidden_states12.shape[1])
bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11,
memory_mask=memory_mask11)
bs_out12 = self.transformer_decoder(hidden_states12, hidden_states_emo12_832, tgt_mask=tgt_mask22,
memory_mask=memory_mask12)
bs_output11 = self.bs_map_r(bs_out11)
bs_output12 = self.bs_map_r(bs_out12)
return bs_output11, bs_output12, label1
def predict(self, audio, level, person):
frame_num11 = math.ceil(audio.shape[1] / 16000 * 30)
inputs12 = self.processor(torch.squeeze(audio), sampling_rate=16000, return_tensors="pt",
padding="longest").input_values.to(self.device)
hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state
inputs12 = self.feature_extractor(torch.squeeze(audio), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
output_emo1 = self.audio_encoder_emo(inputs12, frame_num=frame_num11)
hidden_states_emo1 = output_emo1.hidden_states
onehot_level = self.one_hot_level[level]
onehot_level = torch.from_numpy(onehot_level).to(self.device).float()
onehot_person = self.one_hot_person[person]
onehot_person = torch.from_numpy(onehot_person).to(self.device).float()
if audio.shape[0] == 1:
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0)
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0)
else:
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2)
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2)
obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1)
obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1)
hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1)
hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1)
hidden_states_emo11_256 = self.relu(
self.audio_feature_map_emo2(hidden_states_emo11_832))
hidden_states11 = torch.cat(
[hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2)
if audio.shape[0] == 1:
tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1],
:hidden_states11.shape[1]].clone().detach().to(device=self.device)
memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1])
bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11,
memory_mask=memory_mask11)
bs_output11 = self.bs_map_r(bs_out11)
return bs_output11

emotalk_own/readme.md (Normal file, 103 lines)

@@ -0,0 +1,103 @@
![Psyche AI Inc release](./media/psy_logo.png)
# EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation [ICCV2023]
Official PyTorch implementation for the paper:
> **EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation**, ***ICCV 2023***.
>
> Ziqiao Peng, Haoyu Wu, Zhenbo Song, Hao Xu, Xiangyu Zhu, Jun He, Hongyan Liu, Zhaoxin Fan
>
> [arXiv](https://arxiv.org/abs/2303.11089) | [Project Page](https://ziqiaopeng.github.io/emotalk/) | [License](https://github.com/psyai-net/EmoTalk_release/blob/main/LICENSE)
<p align="center">
<img src="./media/emotalk.png" width="90%" />
</p>
> Given audio input expressing different emotions, EmoTalk produces realistic 3D facial animation sequences with corresponding emotional expressions as outputs.
## News
- `2023.10.17` Thanks to [noirmist](https://github.com/noirmist)! Now you can create the environment via Docker; example build and run commands are sketched below.
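A minimal sketch of building and running the provided Dockerfile. The `emotalk` image tag and the mounted path are placeholder choices, and `--gpus all` assumes the NVIDIA Container Toolkit is installed:

```bash
# Build the image from the Dockerfile in this directory (tag name is arbitrary).
docker build -t emotalk .

# Start an interactive shell with GPU access and the repo mounted at /workspace,
# which is the WORKDIR declared in the Dockerfile.
docker run --gpus all -it -v "$(pwd)":/workspace emotalk zsh
```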
## Environment
- Linux
- Python 3.8.8
- PyTorch 1.12.1
- CUDA 11.3
- Blender 3.4.1
- ffmpeg 4.4.1
Clone the repo:
```bash
git clone https://github.com/psyai-net/EmoTalk_release.git
cd EmoTalk_release
```
Create conda environment:
```bash
conda create -n emotalk python=3.8.8
conda activate emotalk
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
pip install -r requirements.txt
```
## **Demo**
Download Blender and put it in this directory.
```bash
wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz
tar -xf blender-3.4.1-linux-x64.tar.xz
mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz
```
Download the pretrained model from [EmoTalk.pth](https://drive.google.com/file/d/1KQZ-WGI9VDFLqgNXvJQosKVCbjTaCPqK/view?usp=drive_link) (updated) and put it under the `pretrain_model` folder.
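A command-line download sketch, assuming the optional third-party `gdown` package (not listed in `requirements.txt`) is acceptable:

```bash
pip install gdown
mkdir -p pretrain_model
# The file ID is taken from the Google Drive link above.
gdown 1KQZ-WGI9VDFLqgNXvJQosKVCbjTaCPqK -O pretrain_model/EmoTalk.pth
```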
Put the audio under the `audio` folder and run
```bash
python demo.py --wav_path "./audio/disgust.wav"
```
The generated animation will be saved in the `result` folder.
## **Dataset**
If you would like to download the 3D-ETF dataset, please fill in the [agreement](https://drive.google.com/file/d/1AQ5_focSgw9WiJdA2R44BQOrdTUe2ABd/view?usp=drive_link), then use an educational email address to email Ziqiao Peng (pengziqiao@ruc.edu.cn) and cc Zhaoxin Fan (fanzhaoxin@psyai.net) to request the download link.
## **Citation**
If you find this work useful for your research, please cite our paper:
```
@InProceedings{Peng_2023_ICCV,
author = {Peng, Ziqiao and Wu, Haoyu and Song, Zhenbo and Xu, Hao and Zhu, Xiangyu and He, Jun and Liu, Hongyan and Fan, Zhaoxin},
title = {EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
month = {October},
year = {2023},
pages = {20687-20697}
}
```
## **Acknowledgement**
Here are some great resources we benefit from:
- [Faceformer](https://github.com/EvelynFan/FaceFormer) for training pipeline
- [EVP](https://github.com/jixinya/EVP) for training dataloader
- [Speech-driven-expressions](https://github.com/YoungSeng/Speech-driven-expressions) for rendering
- [Wav2Vec2 Content](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english) and [Wav2Vec2 Emotion](https://huggingface.co/r-f/wav2vec-english-speech-emotion-recognition) for audio encoder
- [Head Template](http://filmicworlds.com/blog/solving-face-scans-for-arkit/) for visualization.
Thanks to John Hable for sharing his head template under the CC0 license, which was very helpful for visualizing our results.
## **Contact**
For research purposes, such as comparing experimental results, please contact pengziqiao@ruc.edu.cn.
For commercial licensing, please contact fanzhaoxin@psyai.net.
## **License**
This project is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. Please read the [LICENSE](LICENSE) file for more information.
## **Invitation**
We invite you to join [Psyche AI Inc](https://www.psyai.com/home) to conduct cutting-edge research and business implementation together. At Psyche AI Inc, we are committed to pushing the boundaries of what's possible in the fields of artificial intelligence and computer vision, especially their applications in avatars. As a member of our team, you will have the opportunity to collaborate with talented individuals, innovate new ideas, and contribute to projects that have a real-world impact.
If you are passionate about working at the forefront of technology and making a difference, we would love to hear from you. Please visit our website at [Psyche AI Inc](https://www.psyai.com/home) to learn more about us and to apply for open positions. You can also contact us at fanzhaoxin@psyai.net.
Let's shape the future together!!

emotalk_own/render.py (Normal file, 87 lines)

@@ -0,0 +1,87 @@
import bpy
import os
import numpy as np
import sys
filename = str(sys.argv[-1])
root_dir = str(sys.argv[-2])
model_bsList = ["browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"eyeBlinkLeft",
"eyeBlinkRight",
"eyeLookDownLeft",
"eyeLookDownRight",
"eyeLookInLeft",
"eyeLookInRight",
"eyeLookOutLeft",
"eyeLookOutRight",
"eyeLookUpLeft",
"eyeLookUpRight",
"eyeSquintLeft",
"eyeSquintRight",
"eyeWideLeft",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawOpen",
"jawRight",
"mouthClose",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthFunnel",
"mouthLeft",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthPressLeft",
"mouthPressRight",
"mouthPucker",
"mouthRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"]
obj = bpy.data.objects["face"]
bpy.context.scene.render.engine = 'BLENDER_WORKBENCH'
bpy.context.scene.display.shading.light = 'MATCAP'
bpy.context.scene.display.render_aa = 'FXAA'
bpy.context.scene.render.resolution_x = int(512)
bpy.context.scene.render.resolution_y = int(768)
bpy.context.scene.render.fps = 30
bpy.context.scene.render.image_settings.file_format = 'PNG'
cam = bpy.data.objects['Camera']
cam.scale = [2, 2, 2]
bpy.context.scene.camera = cam
output_dir = root_dir + filename
blendshape_path = root_dir + filename + '.npy'
result = []
bs = np.load(blendshape_path)
for i in range(bs.shape[0]):
curr_bs = bs[i]
for j in range(52):
obj.data.shape_keys.key_blocks[model_bsList[j]].value = curr_bs[j]
bpy.context.scene.render.filepath = os.path.join(output_dir, '{}.png'.format(i))
bpy.ops.render.render(write_still=True)

emotalk_own/requirements.txt (Normal file, 5 lines)

@@ -0,0 +1,5 @@
numpy~=1.21.6
transformers~=4.26.0
tqdm~=4.64.1
librosa~=0.10.0
scipy~=1.9.1

emotalk_own/utils.py (Normal file, 39 lines)

@@ -0,0 +1,39 @@
# Borrowed from https://github.com/EvelynFan/FaceFormer/blob/main/faceformer.py
import torch
import math
# Temporal Bias
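# Builds an ALiBi-style biased causal mask: each attention head gets a geometric slope,
# and a period-quantized distance penalty is added on top of an upper-triangular causal
# mask, so attention to frames further in the past is increasingly down-weighted.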
def init_biased_mask(n_head, max_seq_len, period):
def get_slopes(n):
def get_slopes_power_of_2(n):
start = (2 ** (-2 ** -(math.log2(n) - 3)))
ratio = start
return [start * ratio ** i for i in range(n)]
if math.log2(n).is_integer():
return get_slopes_power_of_2(n)
else:
closest_power_of_2 = 2 ** math.floor(math.log2(n))
return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][
:n - closest_power_of_2]
slopes = torch.Tensor(get_slopes(n_head))
bias = torch.arange(start=0, end=max_seq_len, step=period).unsqueeze(1).repeat(1, period).view(-1) // (period)
bias = - torch.flip(bias, dims=[0])
alibi = torch.zeros(max_seq_len, max_seq_len)
for i in range(max_seq_len):
alibi[i, :i + 1] = bias[-(i + 1):]
alibi = slopes.unsqueeze(1).unsqueeze(1) * alibi.unsqueeze(0)
mask = (torch.triu(torch.ones(max_seq_len, max_seq_len)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
mask = mask.unsqueeze(0) + alibi
return mask
# Alignment Bias
def enc_dec_mask(device, T, S):
mask = torch.ones(T, S).to(device)
for i in range(T):
mask[i, i] = 0
return (mask == 1).to(device=device)

emotalk_own/wav2vec.py (Executable file, 245 lines)

@@ -0,0 +1,245 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from dataclasses import dataclass
from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel
from transformers.modeling_outputs import BaseModelOutput
from typing import Optional, Tuple
from transformers.file_utils import ModelOutput
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
_CONFIG_FOR_DOC = "Wav2Vec2Config"
_HIDDEN_STATES_START_POSITION = 2
# the implementation of Wav2Vec2Model is borrowed from https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model
# initialize our encoder with the pre-trained wav2vec 2.0 weights.
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.Tensor] = None,
min_masks: int = 0,
) -> np.ndarray:
bsz, all_sz = shape
mask = np.full((bsz, all_sz), False)
all_num_mask = int(
mask_prob * all_sz / float(mask_length)
+ np.random.rand()
)
all_num_mask = max(min_masks, all_num_mask)
mask_idcs = []
padding_mask = attention_mask.ne(1) if attention_mask is not None else None
for i in range(bsz):
if padding_mask is not None:
sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int(
mask_prob * sz / float(mask_length)
+ np.random.rand()
)
num_mask = max(min_masks, num_mask)
else:
sz = all_sz
num_mask = all_num_mask
lengths = np.full(num_mask, mask_length)
if sum(lengths) == 0:
lengths[0] = min(mask_length, sz - 1)
min_len = min(lengths)
if sz - min_len <= num_mask:
min_len = sz - num_mask - 1
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
min_len = min([len(m) for m in mask_idcs])
for i, mask_idc in enumerate(mask_idcs):
if len(mask_idc) > min_len:
mask_idc = np.random.choice(mask_idc, min_len, replace=False)
mask[i, mask_idc] = True
return mask
# linear interpolation layer
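# Resamples encoder features from the wav2vec 2.0 rate (roughly 50 feature frames per
# second of audio) to the 30 fps animation rate expected downstream, via 1-D linear interpolation.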
def linear_interpolation(features, input_fps, output_fps, output_len=None):
features = features.transpose(1, 2)
seq_len = features.shape[2] / float(input_fps)
if output_len is None:
output_len = int(seq_len * output_fps)
output_features = F.interpolate(features, size=output_len, align_corners=True, mode='linear')
return output_features.transpose(1, 2)
class Wav2Vec2Model(Wav2Vec2Model):
def __init__(self, config):
super().__init__(config)
self.lm_head = nn.Linear(1024, 32)
def forward(
self,
input_values,
attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
frame_num=None
):
self.config.output_attentions = True
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
hidden_states = self.feature_extractor(input_values)
hidden_states = hidden_states.transpose(1, 2)
hidden_states = linear_interpolation(hidden_states, 50, 30, output_len=frame_num)
if attention_mask is not None:
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
attention_mask = torch.zeros(
hidden_states.shape[:2], dtype=hidden_states.dtype, device=hidden_states.device
)
attention_mask[
(torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1)
] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
hidden_states = self.feature_projection(hidden_states)[0]
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
if not return_dict:
return (hidden_states,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@dataclass
class SpeechClassifierOutput(ModelOutput):
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class Wav2Vec2ClassificationHead(nn.Module):
"""Head for wav2vec classification task."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.final_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.pooling_mode = config.pooling_mode
self.config = config
self.wav2vec2 = Wav2Vec2Model(config)
self.classifier = Wav2Vec2ClassificationHead(config)
self.init_weights()
def freeze_feature_extractor(self):
self.wav2vec2.feature_extractor._freeze_parameters()
def merged_strategy(
self,
hidden_states,
mode="mean"
):
if mode == "mean":
outputs = torch.mean(hidden_states, dim=1)
elif mode == "sum":
outputs = torch.sum(hidden_states, dim=1)
elif mode == "max":
outputs = torch.max(hidden_states, dim=1)[0]
else:
raise Exception(
"The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
return outputs
def forward(
self,
input_values,
attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
frame_num=None,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.wav2vec2(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states1 = linear_interpolation(hidden_states, 50, 30, output_len=frame_num)
hidden_states = self.merged_strategy(hidden_states1, mode=self.pooling_mode)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SpeechClassifierOutput(
loss=loss,
logits=logits,
hidden_states=hidden_states1,
attentions=outputs.attentions,
)