Commit 229f45cf authored by ali

feat: add interruption support for the video digital human, fix imprecise playback state of the photo digital human, integrate NLP into the video digital human

parent 83ceae1a
......@@ -34,6 +34,13 @@ contextBridge.exposeInMainWorld('mainApi', {
throw new Error(`Receive failed: Unknown ipc channel name: ${channel}`)
}
},
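// Like receive, but registers a one-shot listener via ipcRenderer.once so the callback
// is removed after the first message (used below for the file-upload response).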
receiveOnce: (channel: string, cbFunc: Function): void => {
if (rendererAvailChannels.includes(channel)) {
ipcRenderer.once(channel, (event, ...args) => cbFunc(event, ...args))
} else {
throw new Error(`Receive failed: Unknown ipc channel name: ${channel}`)
}
},
invoke: async (channel: string, ...data: any[]): Promise<any> => {
if (mainAvailChannels.includes(channel)) {
const result = await ipcRenderer.invoke.apply(null, [channel, ...data])
......
......@@ -127,7 +127,7 @@ function clear() {
<v-app-bar color="#d71b1b" density="compact" class="header">
<template #append>
<v-btn
prepend-icon="mdi-home"
prepend-icon="mdi-image-album"
variant="text"
:class="{ active: isCurrentRoute('/') }"
@click="handleRoute('/')"
......@@ -135,7 +135,7 @@ function clear() {
{{ $t('menu.photo') }}
</v-btn>
<v-btn
prepend-icon="mdi-fit-to-screen-outline"
prepend-icon="mdi-video-account"
variant="text"
:class="{ active: isCurrentRoute('/video') }"
@click="handleRoute('/video')"
......@@ -230,6 +230,14 @@ function clear() {
:model-value="setting.llmUrl"
></v-text-field>
<v-text-field
style="margin-top: 22px"
label="NLP-HOST"
:rules="[(value) => !!value || 'NLP address is required']"
hide-details="auto"
:model-value="setting.nlpHost"
></v-text-field>
<v-select
v-model="setting.liveHost.value"
style="margin-top: 22px"
......
......@@ -65,7 +65,7 @@ export class HwWebRTC extends EventEmitter {
constructor(id: string, log: 'none' | 'error' | 'warn' | 'info' | 'debug' = 'none') {
super()
this.elementId = id
// setLogLevel(log);
window.HWLLSPlayer.setLogLevel(log);
}
/**
......
......@@ -55,7 +55,7 @@ async function init() {
const item = photo.list.find((i) => i.url === url)
photoRole = new PhotoRole(settings.liveHost, `${item?.liveUrl}`, canvasEle)
photoRole.on('asyncAnswer', (ans) => {
photoRole.on('asyncAnswer', async (ans) => {
if (ans.playState === 'playing') {
microphoneState.value = 'reply'
return
......@@ -64,8 +64,7 @@ async function init() {
if (
microphoneState.value === 'reply' &&
ans.playState === 'pause' &&
photoRole!.taskQueueLength === 0 &&
answerArray.length === 0
await checkSteps()
) {
microphoneState.value = 'input'
}
......@@ -220,7 +219,7 @@ async function startVoskWasmAudioInput() {
microphoneState.value = 'loading'
const { recognizer, channel } = await initVosk({
result: onAsr,
result: onQ,
partialResult: (text) => {
// console.log('----------------> partialResult:', text)
}
......@@ -347,12 +346,21 @@ function initVoskWS() {
// if (parsed.result) console.log(parsed.result);
if (parsed.text) {
inputContext.asrPartial = parsed.text
onAsr(inputContext.asrPartial)
onQ(inputContext.asrPartial)
}
}
})
}
function initLLMSocket(): Promise<WebSocket> {
const ws = new WebSocket(settings.llmUrl)
return new Promise((resolve, reject) => {
ws.onopen = () => resolve(ws)
ws.onerror = reject
})
}
function endAudioInput() {
microphoneState.value = 'waitInput'
inputContext.mediaStream?.getTracks().forEach((track) => track.stop())
......@@ -367,7 +375,37 @@ function endAudioInput() {
}
const answerArray: { text: string; isLast: boolean }[] = []
async function onAsr(question: string) {
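// Each stage of answering (LLM chat, TTS) registers a pending promise in `steps`; checkSteps
// races each one against a 10 ms timeout to skip still-pending stages, counts the settled ones,
// and reports true once at least two stages are done.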
const steps: Promise<string>[] = [];
const checkSteps = async () => {
let count = 0;
for (let i = 0; i < steps.length; i++) {
try {
const res = await Promise.race([steps[i], new Promise((resolve) => setTimeout(() => resolve(false), 10))])
if (res === false) continue;
} catch (e) {
console.error(e)
}
count ++;
if (count >= 2) {
return true
}
}
return false;
}
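// Deferred-promise helper: creates a promise and exposes its resolve/reject so each stage
// can settle its own step later.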
const createStep = () => {
let stepResolve: (value: string) => void = () => {};
let stepReject: (reason: string) => void = () => {};
const pose = new Promise<string>((resolve, reject) => {
stepResolve = resolve;
stepReject = reject;
})
return { pose, stepResolve, stepReject }
}
async function onQ(question: string) {
console.log('---------------->question: ', question)
microphoneState.value = 'loading'
......@@ -380,6 +418,9 @@ async function onAsr(question: string) {
let isTime = true
let sliceAnswerLength = 10
answerArray.length = 0
steps.length = 0;
const { pose, stepResolve, stepReject } = createStep();
steps.push(pose);
photoRole!.answerArgs = new PhotoAnswer()
ws.onmessage = (message) => {
......@@ -396,6 +437,7 @@ async function onAsr(question: string) {
runTTSTask(answerArray)
inputContext.ws?.close()
console.log('----------------> answer: ', answer)
stepResolve('chat');
return
}
......@@ -419,7 +461,7 @@ async function onAsr(question: string) {
}
}
} catch (error) {
console.log('Error returning answer -----> ' + JSON.stringify(error))
stepReject(JSON.stringify(error))
}
}
......@@ -427,33 +469,28 @@ async function onAsr(question: string) {
ws.send(JSON.stringify({ prompt: question, historys_list: [] }))
}
function initLLMSocket(): Promise<WebSocket> {
const ws = new WebSocket(settings.llmUrl)
return new Promise((resolve, reject) => {
ws.onopen = () => resolve(ws)
ws.onerror = reject
})
}
let isTTSRunning = false
async function runTTSTask(tasks: { text: string; isLast: boolean }[]) {
if (isTTSRunning) return
isTTSRunning = true
const { pose, stepResolve, stepReject } = createStep();
steps.push(pose);
try {
while (tasks.length) {
const task = tasks.shift()
if (!task) break
if (task.text.trim().length < 1) continue
console.time(task + ' TTS: ')
console.time(task.text + ' TTS: ')
const res = await localTTS({
url: settings.ttsHost,
text: task.text,
audio_path: settings.userData
})
console.log('----------------> TTS:', res[0].text)
console.timeEnd(task + ' TTS: ')
console.timeEnd(task.text + ' TTS: ')
console.log('---------------->', res[0].text)
const audioPath = await uploadFile({ filePath: res[0].text })
......@@ -464,15 +501,16 @@ async function runTTSTask(tasks: { text: string; isLast: boolean }[]) {
})
}
} catch (error) {
console.error(error)
stepReject(JSON.stringify(error))
}
isTTSRunning = false
stepResolve('TTS')
}
function uploadFile({ filePath }: { filePath: string }) {
return new Promise<string>((resolve, reject) => {
window.mainApi.receive(
window.mainApi.receiveOnce(
'msgReceivedFileUploadResponse',
(event: Event, result: { code: number; data: null | { filename: string } }) => {
if (result.code !== 200) {
......
......@@ -25,6 +25,19 @@ const microphoneState = ref<'waitInput' | 'input' | 'loading' | 'disabled' | 're
const videoElement = ref<HTMLVideoElement | null>(null)
const videoElement2 = ref<HTMLVideoElement | null>(null)
const videos = [videoElement, videoElement2]
const inputContext: {
mediaStream?: MediaStream
audioContext?: AudioContext
audioContext2?: AudioContext
scriptProcessorNode?: ScriptProcessorNode
model?: Model
ws?: WebSocket
voskWs?: WebSocket
asrPartial: string
playingAudio?: HTMLAudioElement
} = {
asrPartial: ''
}
onMounted(() => {
// init();
......@@ -87,19 +100,6 @@ function analyzeMicrophoneVolume(stream: MediaStream, callback: (number) => void
inputContext.scriptProcessorNode = recordEventNode
}
const inputContext: {
mediaStream?: MediaStream
audioContext?: AudioContext
audioContext2?: AudioContext
scriptProcessorNode?: ScriptProcessorNode
model?: Model
ws?: WebSocket
voskWs?: WebSocket
asrPartial: string
} = {
asrPartial: ''
}
async function startVoskWasmAudioInput() {
if (microphoneState.value === 'loading') return
......@@ -111,7 +111,7 @@ async function startVoskWasmAudioInput() {
microphoneState.value = 'loading'
const { recognizer, channel } = await initVosk({
result: onAsr,
result: onQ,
partialResult: (text) => {
// console.log('----------------> partialResult:', text)
}
......@@ -238,12 +238,20 @@ function initVoskWS() {
// if (parsed.result) console.log(parsed.result);
if (parsed.text) {
inputContext.asrPartial = parsed.text
onAsr(inputContext.asrPartial)
onQ(inputContext.asrPartial)
}
}
})
}
function initLLMSocket(): Promise<WebSocket> {
const ws = new WebSocket(settings.llmUrl)
return new Promise((resolve, reject) => {
ws.onopen = () => resolve(ws)
ws.onerror = reject
})
}
function endAudioInput() {
microphoneState.value = 'waitInput'
inputContext.mediaStream?.getTracks().forEach((track) => track.stop())
......@@ -255,6 +263,11 @@ function endAudioInput() {
inputContext.voskWs.send('{"eof" : 1}')
inputContext.voskWs.close()
}
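// Interrupt the current reply: drop any queued TTS audio, pause the playing audio and both
// video elements, and stop the playback loop so a new question can start immediately.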
ttsAudios.length = 0;
inputContext.playingAudio?.pause();
videos[1].value?.pause();
videos[0].value?.pause();
isPlayRunning = false;
}
const canplay = () => {
......@@ -272,28 +285,46 @@ function loadVideo(url: string) {
videos[1].value!.addEventListener('canplay', canplay)
}
async function onAsr(question: string) {
console.log('---------------->', question)
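// Send the question to the NLP service; if the returned text matches one of the role's preset
// questions, return that entry's video URL (empty string / undefined when nothing matches).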
async function qNLP(question: string) {
const resp = await (await fetch(`${settings.nlpHost}/api/v1/generate`, {
headers: {
accept: 'application/json, text/plain, */*',
'content-type': 'application/json'
},
body: JSON.stringify({
question
}),
method: 'POST',
mode: 'cors'
})).json() as { results: {text: null | string}[] };
if (resp.results[0].text === null) return '';
for (let i = 0; i < role!.qa.length; i++) {
const { q, url } = role!.qa[i]
if (q.includes(resp.results[0].text)) {
return url;
};
}
}
async function onQ(question: string) {
console.log('----------------> Asr:', question)
if (!role) return
microphoneState.value = 'loading'
question = question.replace(/\s/g, '')
for (let i = 0; i < role.qa.length; i++) {
const { q, url } = role.qa[i]
console.log(question + ' : ' + q)
if (q.includes(question)) {
loadVideo(url)
microphoneState.value = 'reply'
const videoEle = videos[1].value
videoEle!.loop = false
videoEle!.muted = false
videoEle!.onended = () => {
videoEle!.onended = null
microphoneState.value = 'input'
// whether re-initialization is needed
}
return
if (await qNLP(question)) {
loadVideo(url)
microphoneState.value = 'reply'
const videoEle = videos[1].value
videoEle!.loop = false
videoEle!.muted = false
videoEle!.onended = () => {
videoEle!.onended = null
microphoneState.value = 'input'
// TODO: whether re-initialization is needed
}
return
}
// No preset video matched; fall back to the LLM directly
......@@ -306,6 +337,10 @@ async function onAsr(question: string) {
inputContext.ws = ws
ws.onmessage = (message) => {
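// Ignore LLM messages that arrive after the reply has been interrupted (state already reset to 'input').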
if (microphoneState.value === 'input') {
return;
}
try {
const { text, event } = JSON.parse(message.data) as {
event: string
......@@ -348,23 +383,16 @@ async function onAsr(question: string) {
}
}
console.log('----------------> Asr:', question)
ws.send(JSON.stringify({ prompt: question, historys_list: [] }))
}
function initLLMSocket(): Promise<WebSocket> {
const ws = new WebSocket(settings.llmUrl)
return new Promise((resolve, reject) => {
ws.onopen = () => resolve(ws)
ws.onerror = reject
})
}
let isTTSRunning = false
async function runTTSTask(tasks: string[]) {
if (isTTSRunning) return
isTTSRunning = true
microphoneState.value = 'loading'
try {
while (tasks.length) {
const task = tasks.shift()
......@@ -373,7 +401,6 @@ async function runTTSTask(tasks: string[]) {
console.time(task + ' TTS: ')
microphoneState.value = 'loading'
const res = await localTTS({
url: settings.ttsHost,
text: task,
......@@ -383,6 +410,12 @@ async function runTTSTask(tasks: string[]) {
console.log('----------------> TTS:', res[0].text)
console.timeEnd(task + ' TTS: ')
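// Abort TTS synthesis once the user has interrupted and the state is back to 'input'.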
// @ts-ignore
if (microphoneState.value === 'input') {
break;
}
const audio = new Audio(`file://${res[0].text}`)
audio.load()
ttsAudios.push(audio)
......@@ -414,6 +447,7 @@ async function runAudioPlay() {
runAudioPlay()
}
await audio.play()
inputContext.playingAudio = audio;
loadVideo(role!.playUrl)
videos[1].value!.loop = true
videos[1].value!.muted = true
......@@ -434,6 +468,14 @@ async function xfTTS(text: string) {
})
console.log('----------------> tts:', res)
}
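// Microphone pointer-down handler: interrupt the ongoing reply, if any, before starting
// the Vosk websocket audio input.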
function down() {
if (microphoneState.value === 'reply') {
endAudioInput();
}
startVoskWsAudioInput();
}
</script>
<template>
......@@ -454,15 +496,14 @@ async function xfTTS(text: string) {
size="x-large"
:disabled="
microphoneState === 'loading' ||
microphoneState === 'disabled' ||
microphoneState === 'reply'
microphoneState === 'disabled'
"
@pointerdown="startVoskWsAudioInput"
@pointerdown="down"
>
<v-icon v-if="microphoneState === 'waitInput'" icon="mdi-microphone"></v-icon>
<v-icon v-if="microphoneState === 'loading'" icon="mdi-microphone-settings"></v-icon>
<v-icon v-if="microphoneState === 'disabled'" icon="mdi-microphone-off"></v-icon>
<v-icon v-if="microphoneState === 'reply'" icon="mdi-message-reply-text-outline"></v-icon>
<v-icon v-if="microphoneState === 'reply'" icon="mdi-volume-high"></v-icon>
<template v-if="microphoneState === 'input'">
<img width="30" height="30" :src="iconMicrophone" alt="" srcset="" />
......@@ -488,7 +529,7 @@ async function xfTTS(text: string) {
color="white"
variant="outlined"
:disabled="microphoneState !== 'waitInput' && microphoneState !== 'input'"
@click="onAsr(item.q)"
@click="onQ(item.q)"
>
<v-icon start icon="mdi-help-circle-outline"></v-icon>
{{ item.q }}
......
......@@ -30,6 +30,7 @@ export type ISettings = {
voskWsLUrl: string
liveHost: string
vConsole: boolean
nlpHost: string
}
const useSettingsStore = defineStore('settings', {
......@@ -67,7 +68,8 @@ const useSettingsStore = defineStore('settings', {
llmToTTSSliceLength: 20,
voskWsLUrl: 'ws://127.0.0.1:2700',
liveHost: 'laihua',
vConsole: true
vConsole: true,
nlpHost: 'http://192.168.1.57:19001'
}) as ISettings,
getters: {},
actions: {
......