Commit fec7d389 authored by ali

feat: switch ASR to WebSocket; when the video digital human has no matching clip, fall back to the LLM directly

parent c1557511
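Context for the change: the renderer now streams microphone audio to a vosk-server over WebSocket instead of running Vosk in-process. The wire protocol assumed by the new `initVoskWS`/`postAudio` code is the standard vosk-server one: send raw 16-bit PCM chunks, receive JSON messages carrying `partial` (in-flight hypothesis) or `text` (final utterance), and send `{"eof" : 1}` to flush. A minimal standalone sketch against the default endpoint added in this commit (the `config` message is optional in vosk-server, shown here for completeness):

// Minimal sketch of the vosk-server exchange (assumes a server at the new voskWsLUrl default)
const asr = new WebSocket('ws://127.0.0.1:2700')
asr.binaryType = 'arraybuffer'
asr.onopen = () => {
  // Optional: announce the PCM sample rate before streaming audio
  asr.send(JSON.stringify({ config: { sample_rate: 8000 } }))
}
asr.onmessage = (event) => {
  const msg = JSON.parse(event.data as string)
  if (msg.partial) console.log('partial:', msg.partial) // hypothesis, may still change
  if (msg.text) console.log('final:', msg.text) // end of utterance
}
// Stream Int16 PCM chunks as they arrive: asr.send(int16Chunk.buffer)
// Flush the recognizer when input ends: asr.send('{"eof" : 1}')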
@@ -250,7 +250,7 @@ function endAudioInput() {
 async function onAsr(question: string) {
   console.log('---------------->question: ', question)
   endAudioInput()
-  const ws = await initSocket()
+  const ws = await initLLMSocket()
   inputContext.ws = ws
   let sliceAnswer = ''
@@ -299,7 +299,7 @@ async function onAsr(question: string) {
   ws.send(JSON.stringify({ prompt: question, historys_list: [] }))
 }
-function initSocket(): Promise<WebSocket> {
+function initLLMSocket(): Promise<WebSocket> {
   const ws = new WebSocket(settings.llmUrl)
   return new Promise((resolve, reject) => {
     ws.onopen = () => resolve(ws)
...
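The LLM side is unchanged apart from the rename. For reference, the wire format visible in the handlers in this diff is a single JSON request followed by streamed text chunks, terminated by a `stream_end` event. A reduced sketch (message shapes taken from this diff, endpoint from the settings default):

// Sketch of the LLM stream as used by onAsr: one prompt in, text chunks out
const llm = new WebSocket('ws://127.0.0.1:9001/api/v1/stream')
llm.onopen = () => llm.send(JSON.stringify({ prompt: '你好', historys_list: [] }))
llm.onmessage = (message) => {
  const { text, event } = JSON.parse(message.data) as { text: string; event: string }
  if (event === 'stream_end') return llm.close() // server signals the answer is complete
  console.log(text) // each message carries the next chunk of the answer
}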
@@ -14,7 +14,8 @@ import useStore from '@/renderer/store'
 const router = useRouter()
 const route = useRoute()
 const { settings, video: useVideo } = useStore()
-const sampleRate = 48000
+let sampleRate = 48000
+const bufferSize = 8192
 const iconMicrophone = new URL('/images/microphone-input.svg', import.meta.url).href
 const recordVolume = ref(0)
@@ -95,9 +96,13 @@ const inputContext: {
   scriptProcessorNode?: ScriptProcessorNode
   model?: Model
   ws?: WebSocket
-} = {}
+  voskWs?: WebSocket
+  asrPartial: string
+} = {
+  asrPartial: ''
+}
-async function startAudioInput() {
+async function startVoskWasmAudioInput() {
   if (microphoneState.value === 'loading') return
   if (microphoneState.value === 'input') {
@@ -114,6 +119,7 @@ async function startAudioInput() {
     }
   })
+  sampleRate = 48000
   const mediaStream = await navigator.mediaDevices.getUserMedia({
     video: false,
     audio: {
@@ -152,6 +158,83 @@ async function startAudioInput() {
   inputContext.audioContext = audioContext
 }
+async function startVoskWsAudioInput() {
+  if (microphoneState.value === 'loading') return
+  if (microphoneState.value === 'input') {
+    endAudioInput()
+    return
+  }
+  // Wait for the Vosk socket to open before any audio is streamed
+  await initVoskWS()
+  sampleRate = 8000
+  const mediaStream = await navigator.mediaDevices.getUserMedia({
+    audio: {
+      echoCancellation: true,
+      noiseSuppression: true,
+      channelCount: 1,
+      sampleRate
+    },
+    video: false
+  })
+  const audioContext = new AudioContext({ sampleRate })
+  const source = audioContext.createMediaStreamSource(mediaStream)
+  const processor = audioContext.createScriptProcessor(bufferSize, 1, 1)
+  source.connect(processor)
+  processor.connect(audioContext.destination)
+  processor.onaudioprocess = (audioDataChunk) => postAudio(audioDataChunk)
+  await analyzeMicrophoneVolume(mediaStream, (val) => {
+    recordVolume.value = val
+  })
+  microphoneState.value = 'input'
+  inputContext.audioContext = audioContext
+  inputContext.mediaStream = mediaStream
+}
+function postAudio(audioDataChunk: AudioProcessingEvent) {
+  if (!inputContext.voskWs) return
+  if (inputContext.voskWs.readyState === WebSocket.OPEN) {
+    const inputData = audioDataChunk.inputBuffer.getChannelData(0) || new Float32Array(bufferSize)
+    // Clamp each Float32 sample to [-1, 1] and scale to the int16 range Vosk expects
+    const targetBuffer = new Int16Array(inputData.length)
+    for (let index = 0; index < inputData.length; index++) {
+      targetBuffer[index] = 32767 * Math.max(-1, Math.min(1, inputData[index]))
+    }
+    inputContext.voskWs.send(targetBuffer.buffer)
+  }
+}
+function initVoskWS(): Promise<WebSocket> {
+  return new Promise((resolve, reject) => {
+    inputContext.voskWs = new WebSocket(settings.voskWsLUrl)
+    inputContext.voskWs.binaryType = 'arraybuffer'
+    inputContext.asrPartial = ''
+    inputContext.voskWs.onopen = () => resolve(inputContext.voskWs as WebSocket)
+    inputContext.voskWs.onerror = (event) => reject(new Error(JSON.stringify(event)))
+    inputContext.voskWs.onmessage = (event) => {
+      if (!event.data) return
+      const parsed = JSON.parse(event.data)
+      // `partial` is an in-flight hypothesis; drop the recognizer's spurious bare "the"
+      if (parsed.partial && parsed.partial !== 'the') inputContext.asrPartial = parsed.partial + '|'
+      // `text` is the final result for the utterance: hand it to the answer pipeline
+      if (parsed.text) {
+        inputContext.asrPartial = parsed.text
+        onAsr(inputContext.asrPartial)
+      }
+    }
+  })
+}
 function endAudioInput() {
   microphoneState.value = 'waitInput'
   inputContext.mediaStream?.getTracks().forEach((track) => track.stop())
@@ -159,28 +242,90 @@ function endAudioInput() {
   inputContext.audioContext2?.close()
   inputContext.scriptProcessorNode && (inputContext.scriptProcessorNode.onaudioprocess = null)
   inputContext.model?.terminate()
-  // inputContext.ws?.close()
+  if (inputContext.voskWs) {
+    // Ask the recognizer to flush its final result, then close (skip the send on a dead socket)
+    if (inputContext.voskWs.readyState === WebSocket.OPEN) inputContext.voskWs.send('{"eof" : 1}')
+    inputContext.voskWs.close()
+  }
+}
+function setVideoUrl(url: string) {
+  const videoEle = videoElement.value as HTMLVideoElement
+  if (!videoEle) return
+  videoEle.src = url
+  videoEle.load()
+  videoEle.play()
 }
 async function onAsr(question: string) {
   endAudioInput()
   console.log('---------------->', question)
-  const videoEle = videoElement.value as HTMLVideoElement
-  if (!role || !videoEle) return
+  if (!role) return
   question = question.replace(/\s/g, '')
   for (let i = 0; i < role.qa.length; i++) {
     const { q, url } = role.qa[i]
     console.log(question + ' : ' + q)
     if (q.includes(question)) {
-      videoEle.src = url
-      videoEle.load()
-      videoEle.play()
+      const videoEle = videoElement.value as HTMLVideoElement
+      videoEle && (videoEle.loop = false)
+      videoEle && (videoEle.muted = false)
+      setVideoUrl(url)
+      return
     }
   }
+  // No matching video clip: fall back to the LLM
+  const ws = await initLLMSocket()
+  let sliceAnswer = ''
+  let answer = ''
+  const answerArray: string[] = []
+  let isTime = true
+  inputContext.ws = ws
+  ws.onmessage = (message) => {
+    try {
+      const { text, event } = JSON.parse(message.data) as {
+        event: string
+        message_num: number
+        text: string
+      }
+      if (event === 'stream_end') {
+        // Flush whatever is left of the current slice and close the stream
+        answerArray.push(sliceAnswer)
+        runTTSTask(answerArray)
+        sliceAnswer = ''
+        inputContext.ws?.close()
+        console.log('----------------> answer: ', answer)
+        return
+      }
+      answer += text
+      isTime && console.time('sliceAnswer')
+      isTime = false
+      sliceAnswer += text
+      // Cut a TTS slice at sentence punctuation once it is long enough
+      if (/[。,?!;,.?!;]/.test(text) && sliceAnswer.length >= 20) {
+        console.timeEnd('sliceAnswer')
+        answerArray.push(sliceAnswer)
+        runTTSTask(answerArray)
+        sliceAnswer = ''
+        isTime = true
+      }
+    } catch (error) {
+      console.log('Failed to parse answer -----> ' + JSON.stringify(error))
+    }
+  }
+  console.log('----------------> Asr:', question)
+  ws.send(JSON.stringify({ prompt: question, historys_list: [] }))
 }
-function initSocket(): Promise<WebSocket> {
+function initLLMSocket(): Promise<WebSocket> {
   const ws = new WebSocket(settings.llmUrl)
   return new Promise((resolve, reject) => {
     ws.onopen = () => resolve(ws)
@@ -197,10 +342,20 @@ async function runTTSTask(tasks: string[]) {
     while (tasks.length) {
       const task = tasks.shift()
       if (!task) break
+      if (task.length < 1) continue
       console.time(task + ' TTS: ')
-      const res = await localTTS({ url: settings.ttsHost, text: task })
-      console.log('----------------> TTS:', res)
+      const res = await localTTS({
+        url: settings.ttsHost,
+        text: task,
+        audio_path: settings.userData
+      })
+      console.log('----------------> TTS:', res[0].text)
       console.timeEnd(task + ' TTS: ')
+      const audio = new Audio(`file://${res[0].text}`)
+      audio.load()
+      ttsAudios.push(audio)
+      runAudioPlay()
     }
   } catch (error) {
     console.error(error)
@@ -209,6 +364,28 @@ async function runTTSTask(tasks: string[]) {
   isTTSRunning = false
 }
+const ttsAudios: HTMLAudioElement[] = []
+let isPlayRunning = false
+async function runAudioPlay() {
+  if (isPlayRunning) return
+  isPlayRunning = true
+  const audio = ttsAudios.shift()
+  if (!audio) {
+    isPlayRunning = false
+    return
+  }
+  audio.onended = () => {
+    isPlayRunning = false
+    // Return the avatar to its muted idle loop once this clip finishes
+    const videoEle = videoElement.value as HTMLVideoElement
+    videoEle && (videoEle.loop = true)
+    videoEle && (videoEle.muted = true)
+    setVideoUrl(new URL('/libai/10.mp4', import.meta.url).href)
+    runAudioPlay()
+  }
+  await audio.play()
+}
 // eslint-disable-next-line no-unused-vars
 async function xfTTS(text: string) {
   const tone = settings.source.find(({ sourceId }) => settings.selectSource === sourceId)
@@ -240,7 +417,7 @@ async function xfTTS(text: string) {
         variant="elevated"
         size="x-large"
         :disabled="microphoneState === 'loading' || microphoneState === 'disabled'"
-        @pointerdown="startAudioInput"
+        @pointerdown="startVoskWsAudioInput"
       >
         <v-icon v-if="microphoneState === 'waitInput'" icon="mdi-microphone"></v-icon>
         <v-icon v-if="microphoneState === 'loading'" icon="mdi-microphone-settings"></v-icon>
...
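A note on the audio conversion in `postAudio` above: the ScriptProcessor delivers Float32 samples in [-1, 1], while the recognizer consumes 16-bit signed PCM, hence the clamp-and-scale by 32767. The same conversion as a standalone helper (hypothetical name, not part of this commit):

// Convert Web Audio Float32 samples ([-1, 1]) to 16-bit signed PCM
function floatTo16BitPCM(input: Float32Array): Int16Array {
  const out = new Int16Array(input.length)
  for (let i = 0; i < input.length; i++) {
    const s = Math.max(-1, Math.min(1, input[i])) // clamp to avoid int16 overflow
    out[i] = s < 0 ? s * 0x8000 : s * 0x7fff // the int16 range is asymmetric
  }
  return out
}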
@@ -25,6 +25,7 @@ export type ISettings = {
   isFullscreen: 'yes' | 'no'
   isOpenDevTools: boolean
   llmUrl: string
+  voskWsLUrl: string
 }
 const useSettingsStore = defineStore('settings', {
@@ -57,7 +58,8 @@ const useSettingsStore = defineStore('settings', {
       selectSource: '',
       isFullscreen: 'no',
       isOpenDevTools: false,
-      llmUrl: 'ws://127.0.0.1:9001/api/v1/stream'
+      llmUrl: 'ws://127.0.0.1:9001/api/v1/stream',
+      voskWsLUrl: 'ws://127.0.0.1:2700'
     }) as ISettings,
   getters: {},
   actions: {
...
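To exercise the new `voskWsLUrl` default locally: the vosk-server project publishes Docker images that listen on port 2700, e.g. `docker run -d -p 2700:2700 alphacep/kaldi-cn:latest` for the Chinese model (per the vosk-server README; image tags may differ).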