feat: asr 采用 ws 方式，视频数字人遇到无法匹配直接转大模型

fec7d389 · ali · c1557511 · fec7d389 · fec7d389 · fec7d389
Commit fec7d389 authored Dec 05, 2023 by ali
Show whitespace changes
Inline Side-by-side

Showing with 195 additions and 16 deletions

ShowPhoto.vue src/renderer/screens/ShowPhoto.vue +2 -2

ShowVideo.vue src/renderer/screens/ShowVideo.vue +190 -13

settings.ts src/renderer/store/settings.ts +3 -1

No files found.
--- a/src/renderer/screens/ShowPhoto.vue
+++ b/src/renderer/screens/ShowPhoto.vue
@@ -250,7 +250,7 @@ function endAudioInput() {
 async function onAsr(question: string) {
  console.log('---------------->question: ', question)
  endAudioInput()
-  const ws = await initSocket()
+  const ws = await initLLMSocket()
  inputContext.ws = ws

  let sliceAnswer = ''
@@ -299,7 +299,7 @@ async function onAsr(question: string) {
  ws.send(JSON.stringify({ prompt: question, historys_list: [] }))
 }

-function initSocket(): Promise<WebSocket> {
+function initLLMSocket(): Promise<WebSocket> {
  const ws = new WebSocket(settings.llmUrl)
  return new Promise((resolve, reject) => {
    ws.onopen = () => resolve(ws)

--- a/src/renderer/screens/ShowVideo.vue
+++ b/src/renderer/screens/ShowVideo.vue
@@ -14,7 +14,8 @@ import useStore from '@/renderer/store'
 const router = useRouter()
 const route = useRoute()
 const { settings, video: useVideo } = useStore()
-const sampleRate = 48000
+let sampleRate = 48000
+const bufferSize = 8192;
 const iconMicrophone = new URL('/images/microphone-input.svg', import.meta.url).href

 const recordVolume = ref(0)
@@ -95,9 +96,13 @@ const inputContext: {
  scriptProcessorNode?: ScriptProcessorNode
  model?: Model
  ws?: WebSocket
-} = {}
+  voskWs?: WebSocket
+  asrPartial: string
+} = {
+  asrPartial: ''
+}

-async function startAudioInput() {
+async function startVoskWasmAudioInput() {
  if (microphoneState.value === 'loading') return

  if (microphoneState.value === 'input') {
@@ -114,6 +119,7 @@ async function startAudioInput() {
    }
  })

+  sampleRate = 48000
  const mediaStream = await navigator.mediaDevices.getUserMedia({
    video: false,
    audio: {
@@ -152,6 +158,83 @@ async function startAudioInput() {
  inputContext.audioContext = audioContext
 }

+/**
+ * Toggles microphone capture that streams audio to the Vosk WebSocket server.
+ *
+ * Does nothing while the microphone state is 'loading'. When the state is
+ * already 'input', the running capture is ended instead. Otherwise the Vosk
+ * socket is opened, the microphone is captured as 8 kHz mono audio and every
+ * processed chunk is forwarded through postAudio().
+ */
+async function startVoskWsAudioInput() {
+  if (microphoneState.value === 'loading') return
+
+  if (microphoneState.value === 'input') {
+    endAudioInput()
+    return
+  }
+
+  try {
+    // Wait for the socket to open. postAudio() drops chunks until the socket is
+    // open anyway, and awaiting turns a failed connection into a handled error
+    // instead of an unhandled promise rejection.
+    await initVoskWS()
+  } catch (error) {
+    console.error(error)
+    return
+  }
+
+  sampleRate = 8000
+  const mediaStream = await navigator.mediaDevices.getUserMedia({
+    video: false,
+    audio: {
+      echoCancellation: true,
+      noiseSuppression: true,
+      channelCount: 1,
+      sampleRate
+    }
+  })
+
+  const audioContext = new AudioContext({ sampleRate })
+  const source = audioContext.createMediaStreamSource(mediaStream)
+  const processor = audioContext.createScriptProcessor()
+  source.connect(processor)
+  processor.connect(audioContext.destination)
+
+  processor.onaudioprocess = (audioDataChunk) => postAudio(audioDataChunk)
+  // Keep a reference so endAudioInput() can detach the onaudioprocess handler.
+  inputContext.scriptProcessorNode = processor
+
+  await analyzeMicrophoneVolume(mediaStream, (val) => {
+    recordVolume.value = val
+  })
+
+  microphoneState.value = 'input'
+
+  inputContext.audioContext = audioContext
+  inputContext.mediaStream = mediaStream
+}
+
+/**
+ * Converts one chunk of captured audio to 16-bit PCM and sends it to the Vosk
+ * WebSocket. The chunk is dropped when the socket is missing or not open.
+ *
+ * @param audioDataChunk Event passed to the ScriptProcessorNode's onaudioprocess handler.
+ */
+function postAudio(audioDataChunk: AudioProcessingEvent) {
+  const voskWs = inputContext.voskWs
+  if (!voskWs || voskWs.readyState !== WebSocket.OPEN) return
+
+  const inputData = audioDataChunk.inputBuffer.getChannelData(0) || new Float32Array(bufferSize)
+  const targetBuffer = new Int16Array(inputData.length)
+  // Visit indices 0..length-1. The previous loop ran from length down to 1, so
+  // it read one element past the end and never converted sample 0.
+  for (let index = 0; index < inputData.length; index++) {
+    // Clamp to [-1, 1] before scaling so out-of-range samples saturate instead
+    // of wrapping around when stored in the Int16Array.
+    const sample = Math.max(-1, Math.min(1, inputData[index]))
+    targetBuffer[index] = 32767 * sample
+  }
+  voskWs.send(targetBuffer.buffer)
+}
+
+/**
+ * Opens the Vosk WebSocket and stores it in inputContext.voskWs.
+ *
+ * Partial results from the server are kept in inputContext.asrPartial; when a
+ * message carries a final `text` result, that text is passed to onAsr().
+ *
+ * @returns Promise that resolves with the socket once it is open and rejects
+ *          when the socket reports an error.
+ */
+function initVoskWS(): Promise<WebSocket> {
+  return new Promise((resolve, reject) => {
+    const ws = new WebSocket(settings.voskWsLUrl)
+    ws.binaryType = 'arraybuffer'
+    inputContext.voskWs = ws
+    inputContext.asrPartial = ''
+
+    ws.onopen = () => resolve(ws)
+
+    // Serializing the error Event yields no useful detail, so name the URL instead.
+    ws.onerror = () => reject(new Error(`Vosk WebSocket error: ${settings.voskWsLUrl}`))
+
+    ws.onmessage = (event) => {
+      if (!event.data) return
+
+      let parsed: { partial?: string; text?: string } | null
+      try {
+        parsed = JSON.parse(event.data)
+      } catch (error) {
+        // A malformed frame must not throw out of the message handler.
+        console.error(error)
+        return
+      }
+      if (!parsed) return
+
+      // NOTE(review): 'the' is presumably a spurious hypothesis to be ignored — confirm.
+      if (parsed.partial && parsed.partial !== 'the') inputContext.asrPartial = parsed.partial + '|'
+      if (parsed.text) {
+        inputContext.asrPartial = parsed.text
+        // onAsr() is async; catch its rejection so it is not left unhandled.
+        onAsr(inputContext.asrPartial).catch((error) => console.error(error))
+      }
+    }
+  })
+}
+
 function endAudioInput() {
  microphoneState.value = 'waitInput'
  inputContext.mediaStream?.getTracks().forEach((track) => track.stop())
@@ -159,28 +242,90 @@ function endAudioInput() {
  inputContext.audioContext2?.close()
  inputContext.scriptProcessorNode && (inputContext.scriptProcessorNode.onaudioprocess = null)
  inputContext.model?.terminate()
-  // inputContext.ws?.close()
+  if (inputContext.voskWs) {
+    inputContext.voskWs.send('{"eof" : 1}');
+    inputContext.voskWs.close();
+  }
+}
+
+/**
+ * Points the video element at `url`, reloads it and starts playback.
+ * Does nothing when the video element is not available.
+ *
+ * @param url Source URL of the clip to play.
+ */
+function setVideoUrl(url: string) {
+  const videoEle = videoElement.value as HTMLVideoElement
+  if (!videoEle) return
+
+  videoEle.src = url
+  videoEle.load()
+  // play() returns a promise that rejects when playback cannot start (for
+  // example when a later load() interrupts it); handle the rejection here so
+  // it does not surface as an unhandled promise rejection.
+  videoEle.play().catch((error) => console.error(error))
 }

 async function onAsr(question: string) {
  endAudioInput()
  console.log('---------------->', question)
-  const videoEle = videoElement.value as HTMLVideoElement
-  if (!role || !videoEle) return
+  if (!role) return

  question = question.replace(/\s/g, '')
  for (let i = 0; i < role.qa.length; i++) {
    const { q, url } = role.qa[i]
    console.log(question + ' : ' + q)
    if (q.includes(question)) {
-      videoEle.src = url
-      videoEle.load()
-      videoEle.play()
+      const videoEle = videoElement.value as HTMLVideoElement
+      videoEle && (videoEle.loop = false);
+      videoEle && (videoEle.muted = false);
+      setVideoUrl(url);
+      return;
+    }
+  }
+
+  // 视频链接匹配不上，直接走大模型
+  const ws = await initLLMSocket()
+  let sliceAnswer = ''
+  let answer = ''
+  const answerArray: string[] = []
+  let isTime = true
+  inputContext.ws = ws
+
+  ws.onmessage = (message) => {
+    try {
+      const { text, event } = JSON.parse(message.data) as {
+        event: string
+        message_num: number
+        text: string
+      }
+
+      if (event === 'stream_end') {
+        answerArray.push(sliceAnswer)
+        runTTSTask(answerArray)
+        sliceAnswer = ''
+
+        answerArray.push(sliceAnswer)
+        sliceAnswer = ''
+        inputContext.ws?.close()
+        console.log('----------------> answer: ', answer)
+        return
      }
+
+      answer += text
+      isTime && console.time('sliceAnswer')
+      isTime = false
+      sliceAnswer += text
+
+      if (/[。，？！；,.?!;]/.test(text) && sliceAnswer.length >= 20) {
+        console.timeEnd('sliceAnswer')
+        answerArray.push(sliceAnswer)
+        runTTSTask(answerArray)
+        sliceAnswer = ''
+        isTime = true
      }
+    } catch (error) {
+      console.log('返回答案错误 -----> ' + JSON.stringify(error))
+    }
+  }
+
+  console.log('----------------> Asr:', question)
+  ws.send(JSON.stringify({ prompt: question, historys_list: [] }))
+
 }

-function initSocket(): Promise<WebSocket> {
+function initLLMSocket(): Promise<WebSocket> {
  const ws = new WebSocket(settings.llmUrl)
  return new Promise((resolve, reject) => {
    ws.onopen = () => resolve(ws)
@@ -197,10 +342,20 @@ async function runTTSTask(tasks: string[]) {
    while (tasks.length) {
      const task = tasks.shift()
      if (!task) break
+      if (task.length < 1) continue
+
      console.time(task + ' TTS: ')
-      const res = await localTTS({ url: settings.ttsHost, text: task })
-      console.log('----------------> TTS:', res)
+      const res = await localTTS({
+        url: settings.ttsHost,
+        text: task,
+        audio_path: settings.userData
+      })
+      console.log('----------------> TTS:', res[0].text)
      console.timeEnd(task + ' TTS: ')
+      const audio = new Audio(`file://${res[0].text}`)
+      audio.load()
+      ttsAudios.push(audio)
+      runAudioPlay()
    }
  } catch (error) {
    console.error(error)
@@ -209,6 +364,28 @@ async function runTTSTask(tasks: string[]) {
  isTTSRunning = false
 }

+// Audio clips produced by TTS, waiting to be played in order.
+const ttsAudios: HTMLAudioElement[] = []
+// True while a clip is being played; prevents overlapping playback.
+let isPlayRunning = false
+
+/**
+ * Plays the next queued TTS clip unless one is already playing.
+ *
+ * When a clip ends, the video is switched back to the looping, muted
+ * '/libai/10.mp4' clip and the next queued clip, if any, is started.
+ */
+async function runAudioPlay() {
+  if (isPlayRunning) return
+  isPlayRunning = true
+
+  const audio = ttsAudios.shift()
+  if (!audio) {
+    isPlayRunning = false
+    return
+  }
+  audio.onended = () => {
+    isPlayRunning = false
+    const videoEle = videoElement.value as HTMLVideoElement
+    if (videoEle) {
+      videoEle.loop = true
+      videoEle.muted = true
+    }
+    setVideoUrl(new URL('/libai/10.mp4', import.meta.url).href)
+    runAudioPlay()
+  }
+  try {
+    await audio.play()
+  } catch (error) {
+    // When playback cannot start, 'ended' never fires; release the guard and
+    // continue with the next clip so the queue does not stall.
+    console.error(error)
+    audio.onended = null
+    isPlayRunning = false
+    void runAudioPlay()
+  }
+}
+
 // eslint-disable-next-line no-unused-vars
 async function xfTTS(text: string) {
  const tone = settings.source.find(({ sourceId }) => settings.selectSource === sourceId)
@@ -240,7 +417,7 @@ async function xfTTS(text: string) {
      variant="elevated"
      size="x-large"
      :disabled="microphoneState === 'loading' || microphoneState === 'disabled'"
-      @pointerdown="startAudioInput"
+      @pointerdown="startVoskWsAudioInput"
    >
      <v-icon v-if="microphoneState === 'waitInput'" icon="mdi-microphone"></v-icon>
      <v-icon v-if="microphoneState === 'loading'" icon="mdi-microphone-settings"></v-icon>

--- a/src/renderer/store/settings.ts
+++ b/src/renderer/store/settings.ts
@@ -25,6 +25,7 @@ export type ISettings = {
  isFullscreen: 'yes' | 'no'
  isOpenDevTools: boolean
  llmUrl: string
+  voskWsLUrl: string
 }

 const useSettingsStore = defineStore('settings', {
@@ -57,7 +58,8 @@ const useSettingsStore = defineStore('settings', {
      selectSource: '',
      isFullscreen: 'no',
      isOpenDevTools: false,
-      llmUrl: 'ws://127.0.0.1:9001/api/v1/stream'
+      llmUrl: 'ws://127.0.0.1:9001/api/v1/stream',
+      voskWsLUrl: 'ws://127.0.0.1:2700'
    }) as ISettings,
  getters: {},
  actions: {