import librosa
import numpy as np
import argparse
from parse import parse
from scipy.signal import savgol_filter
import torch
from model import EmoTalk
import random
import os
import subprocess
import shlex
from munch import Munch


def _default_args():
    """Inference configuration shared by model loading, prediction and rendering."""
    return Munch(
        bs_dim=52,
        feature_dim=832,
        period=30,
        device="cuda",
        model_path="./pretrain_model/EmoTalk.pth",
        max_seq_len=5000,
        num_workers=0,
        batch_size=1,
        post_processing=True,
        blender_path="./blender/blender",
    )


@torch.no_grad()
def test(model, speech_array, sampling_rate):
    """Run EmoTalk on a waveform and return per-frame blendshape coefficients."""
    args = _default_args()

    # Hand-crafted 7-frame blink curves for the eye-blink blendshape channels.
    eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503])
    eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929])
    eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896])
    eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493])

    # speech_array is expected at 16 kHz, e.g. librosa.load(wav_path, sr=16000).
    audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device)
    level = torch.tensor([1]).to(args.device)   # emotion-level input
    person = torch.tensor([0]).to(args.device)  # speaker-identity input
    prediction = model.predict(audio, level, person)
    prediction = prediction.squeeze().detach().cpu().numpy()

    if not args.post_processing:
        return prediction

    # Savitzky-Golay smoothing (window 5, polynomial order 2) per blendshape channel.
    output = np.zeros((prediction.shape[0], prediction.shape[1]))
    for i in range(prediction.shape[1]):
        output[:, i] = savgol_filter(prediction[:, i], 5, 2)

    # Overwrite the eye-blink channels (indices 8 and 9) with randomly timed blinks.
    output[:, 8] = 0
    output[:, 9] = 0
    i = random.randint(0, 60)
    while i < output.shape[0] - 7:
        eye_num = random.randint(1, 4)
        if eye_num == 1:
            blink = eye1
        elif eye_num == 2:
            blink = eye2
        elif eye_num == 3:
            blink = eye3
        else:
            blink = eye4
        output[i:i + 7, 8] = blink
        output[i:i + 7, 9] = blink
        i += random.randint(60, 180)  # wait 60-180 frames before the next blink
    return output


def render_video(wav_name, model_name):
    """Render the predicted animation to PNG frames with Blender, yielding each saved frame path."""
    args = _default_args()
    image_path = os.path.join("./audio", wav_name)
    os.makedirs(image_path, exist_ok=True)

    blender_path = args.blender_path
    python_path = f"./{model_name}.py"
    blend_path = f"./{model_name}.blend"
    print(python_path, blend_path)

    cmd = '{} -t 64 -b {} -P {} -- "{}" "{}"'.format(blender_path, blend_path, python_path, "./audio/", wav_name)
    cmd = shlex.split(cmd)
    p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    # Read Blender's output until EOF so that no trailing "Saved:" lines are missed.
    for raw in p.stdout:
        line = raw.decode('utf-8').strip()
        if line and line.startswith('Saved: '):
            fname = parse("Saved: '{}'", line).fixed[0]
            yield fname
        else:
            print(line)
    p.wait()

    if p.returncode == 0:
        print('Subprogram success')
    else:
        print('Subprogram failed')


def construct_video(wav_name):
    """Mux the rendered frames and the source audio into an MP4, then delete the frame directory."""
    image_path = os.path.join("./audio", wav_name)
    os.makedirs(image_path, exist_ok=True)
    image_temp = image_path + "/%d.png"
    output_path = os.path.join("./audio", wav_name + ".mp4")
    cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 -y "{}"'.format(
        image_temp, f"./audio/{wav_name}.wav", output_path)
    subprocess.call(cmd, shell=True)
    cmd = 'rm -rf "{}"'.format(image_path)
    subprocess.call(cmd, shell=True)


class PieInfer(object):
    """Loads the pretrained EmoTalk model once and exposes audio-to-blendshape inference as a callable."""

    def __init__(self):
        args = _default_args()
        model = EmoTalk(args)
        model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False)
        model = model.to(args.device)
        model.eval()
        self.model = model

    def __call__(self, speech_array, sampling_rate):
        return test(self.model, speech_array, sampling_rate)
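

# Example entry point (a minimal sketch, not part of the original script). It assumes a
# 16 kHz WAV already placed at ./audio/test.wav and a Blender driver pair named "render"
# (./render.py and ./render.blend); the clip name, file layout, and the .npy hand-off to
# the Blender script are assumptions — adjust them to your setup.
if __name__ == "__main__":
    wav_name = "test"  # hypothetical clip name; ./audio/test.wav must exist
    speech_array, sampling_rate = librosa.load(f"./audio/{wav_name}.wav", sr=16000)

    infer = PieInfer()
    blendshapes = infer(speech_array, sampling_rate)  # (num_frames, bs_dim) coefficients
    # The Blender script is assumed to pick up the coefficients from ./audio/<wav_name>.npy.
    np.save(os.path.join("./audio", wav_name + ".npy"), blendshapes)

    # render_video is a generator; consuming it drives the Blender render frame by frame.
    for frame_path in render_video(wav_name, "render"):
        print("rendered", frame_path)
    construct_video(wav_name)  # writes ./audio/test.mp4 and removes the frame directory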