feat: 处理视频数字人说话状态，静止状态衔接问题

8a378618 · ali · 0a290374 · 8a378618 · 8a378618 · 8a378618
Commit 8a378618 authored Jan 04, 2024 by ali
Showing with 118 additions and 69 deletions

say.mov src/renderer/public/suhe/say.mov +0 -0

silence.mov src/renderer/public/suhe/silence.mov +0 -0

ShowVideo.vue src/renderer/screens/ShowVideo.vue +114 -65

video.ts src/renderer/store/video.ts +4 -4

No files found.
--- a/src/renderer/public/suhe/say.mov
+++ b/src/renderer/public/suhe/say.mov
--- a/src/renderer/public/suhe/wait.mov
+++ b/src/renderer/public/suhe/wait.mov
--- a/src/renderer/screens/ShowVideo.vue
+++ b/src/renderer/screens/ShowVideo.vue
 <!-- eslint-disable no-unused-vars -->
 <!-- eslint-disable camelcase -->
 <script setup lang="ts">
-import { onMounted, ref } from 'vue'
+import { nextTick, onMounted, ref } from 'vue'
 import { useRoute, useRouter } from 'vue-router'
-import type {
-  ServerMessagePartialResult,
-  ServerMessageResult,
-  Model
-} from '@/renderer/plugins/asr/index'
-import { audioAiTTS, localTTS } from '../plugins/tts'
 import useStore from '@/renderer/store'
+import { guid } from '@/renderer/utils/index'

 const router = useRouter()
 const route = useRoute()
@@ -22,15 +17,11 @@ const recordVolume = ref(0)
 const url = route.query.url as string
 const role = useVideo.list.find((i) => i.url === url)
 const microphoneState = ref<'waitInput' | 'input' | 'loading' | 'disabled' | 'reply'>('waitInput')
-const videoElement = ref<HTMLVideoElement | null>(null)
-const videoElement2 = ref<HTMLVideoElement | null>(null)
-const videos = [videoElement, videoElement2]
 const inputContext: {
  mediaStream?: MediaStream
  audioContext?: AudioContext
  audioContext2?: AudioContext
  scriptProcessorNode?: ScriptProcessorNode
-  model?: Model
  ws?: WebSocket
  voskWs?: WebSocket
  asrPartial: string
@@ -43,8 +34,18 @@ const inputContext: {
  ttsAudios: []
 }

+const videoElements = { // the role's two clips, created (and therefore loading) at setup: idle ("silence") and talking ("say")
+  silence: createVideo(role?.url as string),
+  say: createVideo(role?.say as string),
+ }
+const can = ref<HTMLCanvasElement | null>(null) // template ref bound to the <canvas ref="can"> element
+let videoElement: HTMLVideoElement | null = null; // video currently drawn to the canvas; reassigned by init() and toggleVideo()
+
 onMounted(() => {
-  // init();
+  init().catch((error) => { // start canvas rendering; on failure reset the microphone state and surface the error
+    microphoneState.value = 'waitInput'
+    showError(`init：${error}`)
+  })
 })

 router.beforeEach((g) => {
@@ -58,6 +59,41 @@ const showError = (msg: string) => {
  errorMsg.value = msg
 }

+/**
+ * Paint the current frame of `video` onto the canvas that owns `ctx`.
+ *
+ * The canvas backing store is resized only when its size differs from the
+ * video's intrinsic size: assigning `canvas.width` / `canvas.height` resets the
+ * bitmap and the context state even when the value is unchanged, so doing it
+ * on every frame is wasted work. While the video has no decoded frame
+ * available (e.g. right after a source switch or a seek) the paint is skipped,
+ * which leaves the previously drawn frame on screen instead of a blank canvas.
+ *
+ * @param ctx - 2D rendering context of the target canvas.
+ * @param video - Video element whose current frame is drawn.
+ */
+function drawFrame(
+  ctx: CanvasRenderingContext2D,
+  video: HTMLVideoElement,
+) {
+  const { videoWidth, videoHeight } = video
+  // No dimensions or no current frame yet: keep whatever was painted last.
+  if (videoWidth === 0 || videoHeight === 0) return
+  if (video.readyState < HTMLMediaElement.HAVE_CURRENT_DATA) return
+
+  const { canvas } = ctx
+  if (canvas.width !== videoWidth) canvas.width = videoWidth
+  if (canvas.height !== videoHeight) canvas.height = videoHeight
+
+  // Explicit clear so frames with transparent pixels do not blend with the previous frame.
+  ctx.clearRect(0, 0, videoWidth, videoHeight)
+  ctx.drawImage(video, 0, 0, videoWidth, videoHeight)
+}
+
+
+/**
+ * Start rendering the role's idle ("silence") clip onto the canvas.
+ *
+ * Waits for the idle clip to become playable, starts a requestAnimationFrame
+ * loop that paints the current `videoElement` at most ~30 times per second,
+ * and then starts playback. Does nothing when the canvas 2D context is not
+ * available.
+ *
+ * @returns A promise that rejects if the idle clip fails to load or if
+ *   `play()` is rejected, so the caller's `.catch` can report it.
+ */
+async function init() {
+  const ctx = can.value?.getContext('2d');
+  if (!ctx) return
+
+  videoElement = videoElements.silence.ele
+  await videoElements.silence.load
+
+  // Minimum time between two paints, in milliseconds (caps painting at ~30 fps).
+  const frameInterval = 1000 / 30
+  let lastTime = Date.now()
+  // NOTE(review): this loop is never cancelled; consider cancelAnimationFrame on unmount.
+  const updateFrame = () => {
+    if (Date.now() - lastTime > frameInterval) {
+      // Reads the module-level `videoElement` each tick so toggleVideo() switches take effect.
+      videoElement && drawFrame(ctx, videoElement)
+      lastTime = Date.now()
+    }
+    requestAnimationFrame(updateFrame)
+  }
+  requestAnimationFrame(updateFrame)
+
+  // Awaited (after the render loop is running) so a rejected play() propagates
+  // to the caller instead of becoming an unhandled promise rejection.
+  await videoElement.play()
+}
+
+
 function analyzeMicrophoneVolume(stream: MediaStream, callback: (number) => void) {
  const audioContext = new AudioContext()
  const analyser = audioContext.createAnalyser()
@@ -187,7 +223,7 @@ async function llmEnd() {
    })
  ).json()

-  console.log('---------------->', resp)
+  console.log('----------------> llmEnd: ', resp)
 }

 async function endAudioInput() {
@@ -198,31 +234,55 @@ async function endAudioInput() {
  inputContext.audioContext?.close()
  inputContext.audioContext2?.close()
  inputContext.scriptProcessorNode && (inputContext.scriptProcessorNode.onaudioprocess = null)
-  inputContext.model?.terminate()
  if (inputContext.voskWs) {
    inputContext.voskWs.send('{"eof" : 1}')
    inputContext.voskWs.close()
  }
  inputContext.ttsAudios.length = 0
  inputContext.playingAudio?.pause()
-  videos[1].value?.pause()
-  videos[0].value?.pause()
+  toggleVideo(videoElements.silence.ele)
  isPlayRunning = false
 }

-const canplay = () => {
-  videos[1].value!.style.opacity = '1'
-  videos[0].value!.style.opacity = '0'
-  videos[0].value!.pause()
-  videos[1].value!.play()
-  videos[1].value!.removeEventListener('canplay', canplay)
-  videos.unshift(videos.pop()!)
-}
+/**
+ * Create a hidden <video> element for `url` and start loading it.
+ *
+ * The role's own clips (`role.url` and `role.say`) loop and are muted; any
+ * other URL plays once with sound.
+ *
+ * @param url - Source URL of the clip.
+ * @returns `ele`, the video element, and `load`, a promise that resolves on
+ *   the first `canplay` event and rejects with an `Error` (naming the URL)
+ *   when the element reports a loading error.
+ */
+function createVideo(url: string) {
+  const video = document.createElement('video');
+
+  const isRoleClip = url === role?.url || url === role?.say;
+  video.loop = isRoleClip;
+  video.muted = isRoleClip;

-function loadVideo(url: string) {
-  videos[1].value!.src = url
-  videos[1].value!.style.opacity = '0'
-  videos[1].value!.addEventListener('canplay', canplay)
+  video.style.display = 'none';
+
+  const load = new Promise<void>((resolve, reject) => {
+    video.oncanplay = () => {
+      // One-shot: detach both handlers once the outcome is known.
+      video.oncanplay = null;
+      video.onerror = null;
+      resolve()
+      // NOTE(review): the reason for seeking to 2s here is not visible in this file;
+      // toggleVideo() resets currentTime before playback. Kept as-is.
+      video.currentTime = 2;
+      document.body.appendChild(video);
+    };
+    video.onerror = () => {
+      video.oncanplay = null;
+      video.onerror = null;
+      // Reject with a real Error so callers that interpolate it into a message
+      // show something readable instead of "[object Event]".
+      const detail = video.error ? ` (code ${video.error.code}: ${video.error.message})` : '';
+      reject(new Error(`failed to load video ${url}${detail}`));
+    };
+  })
+  // Assigned after the handlers are attached so no event can be missed.
+  video.src = url;
+
+  return {
+    ele: video,
+    load
+  };
+}
+/**
+ * Make `ele` the video that is drawn to the canvas and start playing it.
+ *
+ * The previously active video is paused and rewound to 0.1s. If it is neither
+ * the role's "silence" clip nor its "say" clip (i.e. a one-off clip), it is
+ * also detached from the document.
+ *
+ * @param ele - Video element to switch to.
+ */
+function toggleVideo(ele: HTMLVideoElement) {
+  const previous = videoElement
+  if (previous) {
+    previous.pause();
+    previous.currentTime = 0.1;
+    if (previous !== videoElements.silence.ele && previous !== videoElements.say.ele) {
+      // remove() is a no-op for an already detached element, whereas
+      // document.body.removeChild() would throw if it is not a child of <body>.
+      previous.remove();
+    }
+  }
+  ele.currentTime = 0.1;
+  ele.pause();
+  videoElement = ele
+  // play() returns a promise; log a rejection instead of leaving it unhandled.
+  ele.play().catch((error) => console.error(error))
 }

 async function qNLP(question: string) {
@@ -258,19 +318,21 @@ async function onQ(question: string) {
  try {
    const nlpUrl = await qNLP(question)
    if (nlpUrl) {
-      loadVideo(nlpUrl)
+      const { ele, load } = createVideo(nlpUrl)
      microphoneState.value = 'reply'
-      const videoEle = videos[1].value
-      videoEle!.loop = false
-      videoEle!.muted = false
-      videoEle!.onended = () => {
-        videoEle!.onended = null
+      await load;
+      // 防止切换视频渲染黑屏帧
+      await new Promise(resolve => setTimeout(resolve, 200))
+      toggleVideo(ele)
+      ele.onended = () => {
+        toggleVideo(videoElements.silence.ele)
        microphoneState.value = 'input'
        // TODO: 是否需要初始化
      }
      return
    }
  } catch (error) {
+    console.error(error);
    microphoneState.value = 'input'
    showError(`nlp：${error}`)
    return
@@ -287,15 +349,15 @@ async function onQ(question: string) {

 async function llmLoop(question: string) {
  if (!role) return
-  microphoneState.value = 'loading'

+  const sessionId = guid()
  const resp = await (
    await fetch(`${settings.llmUrl}/api/v1/generate`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
-      body: JSON.stringify({ question }),
+      body: JSON.stringify({ generator_id: sessionId, question }),
      mode: 'cors'
    })
  ).json()
@@ -313,6 +375,7 @@ async function llmLoop(question: string) {
      break
    }

+    // 轮询间隔时间
    await new Promise((resolve) => setTimeout(resolve, 100))

    const { results } = await (
@@ -322,18 +385,18 @@ async function llmLoop(question: string) {
          'Content-Type': 'application/json'
        },
        mode: 'cors',
-        body: JSON.stringify({ question })
+        body: JSON.stringify({ generator_id: sessionId, question })
      })
    ).json()

    const audioList = results[0].audio_list as string[]
    if (audioList.length === 0) continue
-    const isEnd = audioList.at(-1) === 'stream_end'
+    inputContext.llmEnd = audioList.at(-1) === 'stream_end';

-    if (isEnd) audioList.pop()
+    if (inputContext.llmEnd) audioList.pop()

    const newList = audioList.slice(index)
-    if (newList.length === 0 && isEnd) break
+    if (newList.length === 0 && inputContext.llmEnd) return;
    if (newList.length === 0) continue

    for (let i = index; i < audioList.length; i++) {
@@ -358,18 +421,14 @@ async function llmLoop(question: string) {
    // TODO: test
    // inputContext.ttsAudios.push(
    //   ...newList.map((path) => {
-    //     const audio = new Audio(`http://192.168.1.57:6767/${path.split('\\').pop()}`)
+    //     const audio = new Audio(`http://10.90.120.45:6767/${path.split('\\').pop()}`)
    //     audio.load()
    //     return audio
    //   })
    // )

    runAudioPlay()
-
-    if (isEnd) break
  }
-
-  inputContext.llmEnd = true
 }

 let isPlayRunning = false
@@ -380,7 +439,7 @@ async function runAudioPlay() {
  const audio = inputContext.ttsAudios.shift()
  if (!audio) {
    isPlayRunning = false
-    videos[0].value!.pause()
+    toggleVideo(videoElements.silence.ele);
    inputContext.llmEnd && (microphoneState.value = 'input')
    return
  }
@@ -390,9 +449,12 @@ async function runAudioPlay() {
  }
  await audio.play()
  inputContext.playingAudio = audio
-  loadVideo(role!.playUrl)
-  videos[1].value!.loop = true
-  videos[1].value!.muted = true
+  videoElements.say.load.then(async () => {
+    if (videoElements.say.ele.paused) {
+      toggleVideo(videoElements.say.ele);
+      videoElements.say.ele.play()
+    }
+  });

  microphoneState.value = 'reply'
 }
@@ -416,8 +478,7 @@ async function down() {
    class="d-flex justify-center align-center"
    :style="{ background: '#000' }"
  >
-    <video id="videoElement" ref="videoElement" :src="url" class="video-ele active"></video>
-    <video id="videoElement2" ref="videoElement2" class="video-ele2"></video>
+    <canvas id="can" ref="can" style="width: 100%; height: 100%; aspect-ratio: 9/16; "></canvas>
  </div>

  <div class="voice">
@@ -465,7 +526,7 @@ async function down() {
    </v-chip>
  </div>

-  <v-snackbar v-model="errorSnackbar" multi-line :timeout="3000">
+  <v-snackbar v-model="errorSnackbar" multi-line :timeout="6000">
    {{ errorMsg }}

    <template #actions>
@@ -503,18 +564,6 @@ async function down() {
  border-radius: 36%;
 }

-.video-ele,
-.video-ele2 {
-  position: absolute;
-  width: 100%;
-  height: 100%;
-  opacity: 0;
-}
-.video-ele.active,
-.video-ele2.active {
-  opacity: 1;
-}
-
 .q-list {
  position: fixed;
  bottom: 0;
@@ -527,4 +576,4 @@ async function down() {
  cursor: pointer;
  margin: 0 6px;
 }
-</style>
+</style>
\ No newline at end of file
--- a/src/renderer/store/video.ts
+++ b/src/renderer/store/video.ts
@@ -2,7 +2,7 @@ import { defineStore } from 'pinia'

 type IVideo = {
  list: {
-    playUrl: string
+    say: string
    url: string
    poster: string
    name: string
@@ -17,9 +17,9 @@ const useVideoStore = defineStore('video', {
      list: [
        {
          url: new URL('/libai/wait.mp4', import.meta.url).href,
+          say: new URL('/libai/10.mp4', import.meta.url).href,
          poster: new URL('/libai/poster.jpg', import.meta.url).href,
          name: '李白',
-          playUrl: new URL('/libai/10.mp4', import.meta.url).href,
          qa: [
            {
              url: new URL('/libai/1.mp4', import.meta.url).href,
@@ -74,9 +74,9 @@ const useVideoStore = defineStore('video', {
          ]
        },
        {
-          url: new URL('/suhe/wait.mov', import.meta.url).href,
+          url: new URL('/suhe/silence.mov', import.meta.url).href,
+          say: new URL('/suhe/say.mov', import.meta.url).href,
          poster: new URL('/suhe/poster.jpg', import.meta.url).href,
-          playUrl: new URL('/suhe/5.mov', import.meta.url).href,
          name: '苏荷',
          qa: [
            {