initial commit

2024-04-24 06:57:30 +04:00
commit 3633aa99e5
29 changed files with 2555 additions and 0 deletions
--- a/miapia_own/pieinfer.py
+++ b/miapia_own/pieinfer.py
@@ -0,0 +1,153 @@
+import librosa
+import numpy as np
+import argparse
+
+from parse import parse
+from scipy.signal import savgol_filter
+import torch
+from model import EmoTalk
+import random
+import os, subprocess
+import shlex
+from munch import Munch
+
+
+@torch.no_grad()
+def test(model, speech_array, sampling_rate):
+    args = Munch(
+        bs_dim=52,
+        feature_dim=832,
+        period=30,
+        device="cuda",
+        model_path="./pretrain_model/EmoTalk.pth",
+        max_seq_len=5000,
+        num_workers=0,
+        batch_size=1,
+        post_processing=True,
+        blender_path="./blender/blender")
+
+    eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503])
+    eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929])
+    eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896])
+    eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493])
+    # speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000)
+    audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device)
+    level = torch.tensor([1]).to(args.device)
+    person = torch.tensor([0]).to(args.device)
+    prediction = model.predict(audio, level, person)
+    prediction = prediction.squeeze().detach().cpu().numpy()
+    if args.post_processing:
+        output = np.zeros((prediction.shape[0], prediction.shape[1]))
+        for i in range(prediction.shape[1]):
+            output[:, i] = savgol_filter(prediction[:, i], 5, 2)
+        output[:, 8] = 0
+        output[:, 9] = 0
+        i = random.randint(0, 60)
+        while i < output.shape[0] - 7:
+            eye_num = random.randint(1, 4)
+            if eye_num == 1:
+                output[i:i + 7, 8] = eye1
+                output[i:i + 7, 9] = eye1
+            elif eye_num == 2:
+                output[i:i + 7, 8] = eye2
+                output[i:i + 7, 9] = eye2
+            elif eye_num == 3:
+                output[i:i + 7, 8] = eye3
+                output[i:i + 7, 9] = eye3
+            else:
+                output[i:i + 7, 8] = eye4
+                output[i:i + 7, 9] = eye4
+            time1 = random.randint(60, 180)
+            i = i + time1
+        return output
+    else:
+        return prediction
+
+
+def render_video(wav_name, model_name):
+    args = Munch(
+        bs_dim=52,
+        feature_dim=832,
+        period=30,
+        device="cuda",
+        model_path="./pretrain_model/EmoTalk.pth",
+        max_seq_len=5000,
+        num_workers=0,
+        batch_size=1,
+        post_processing=True,
+        blender_path="./blender/blender")
+
+    # wav_name = args.wav_path.split('/')[-1].split('.')[0]
+    image_path = os.path.join("./audio", wav_name)
+    os.makedirs(image_path, exist_ok=True)
+    blender_path = args.blender_path
+
+    python_path = f"./{model_name}.py"
+    blend_path = f"./{model_name}.blend"
+    print(python_path, blend_path)
+    # python_path = "./render.py"
+    # blend_path = "./render.blend"
+    cmd = '{} -t 64 -b {} -P {} -- "{}" "{}" '.format(blender_path,
+                                                      blend_path,
+                                                      python_path,
+                                                      "./audio/",
+                                                      wav_name)
+    cmd = shlex.split(cmd)
+    p = subprocess.Popen(cmd,
+                         shell=False,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.STDOUT)
+    while p.poll() is None:
+        line = p.stdout.readline().decode('utf-8')
+        line = line.strip()
+        if line and line.startswith('Saved: '):
+            fname = parse("Saved: '{}'", line).fixed[0]
+            yield fname
+        else:
+            print(line)
+
+    if p.returncode == 0:
+        print('Subprogram success')
+    else:
+        print('Subprogram failed')
+
+
+def construct_video(wav_name):
+    image_path = os.path.join("./audio", wav_name)
+    os.makedirs(image_path, exist_ok=True)
+    image_temp = image_path + "/%d.png"
+    output_path = os.path.join("./audio", wav_name + ".mp4")
+    cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 "{}" -y'.format(image_temp,
+                                                                                    f"./audio/{wav_name}.wav",
+                                                                                    output_path)
+    subprocess.call(cmd, shell=True)
+    cmd = 'rm -rf "{}"'.format(image_path)
+    subprocess.call(cmd, shell=True)
+
+
+class PieInfer(object):
+    def __init__(self):
+        args = Munch(
+            bs_dim=52,
+            feature_dim=832,
+            period=30,
+            device="cuda",
+            model_path="./pretrain_model/EmoTalk.pth",
+            max_seq_len=5000,
+            num_workers=0,
+            batch_size=1,
+            post_processing=True,
+            blender_path="./blender/blender")
+        #"""
+        model = EmoTalk(args)
+        model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False)
+        model = model.to(args.device)
+        model.eval()
+        #"""
+        # model = None
+        self.model = model
+
+    def __call__(self,
+                 speech_array,
+                 sampling_rate):
+        return test(self.model, speech_array, sampling_rate)