Commit 71728fa5 authored by ali

feat: rewrite the llm websocket client as an llm loop post

parent 19dfc9af
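
The change drops the streaming WebSocket client (initLLMSocket / ws.onmessage) in both pages and replaces it with plain HTTP calls that the renderer polls in a loop. Below is a minimal sketch of the new flow, assuming only the endpoints and payloads visible in the diff (/api/v1/generate, /api/v1/audio and /api/v1/interrupt on settings.llmUrl); the helper names startGeneration, pollAudioList and interrupt are illustrative and not part of the commit:

async function startGeneration(llmUrl: string, question: string): Promise<void> {
  // Kick off generation; the diff treats status_code 100 as "accepted/running".
  const resp = await (await fetch(`${llmUrl}/api/v1/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ question }),
    mode: 'cors'
  })).json()
  if (resp.results[0].status_code !== 100) throw new Error(JSON.stringify(resp.results))
}

async function pollAudioList(llmUrl: string, question: string, onChunk: (paths: string[]) => void) {
  // Poll /api/v1/audio; the server returns a cumulative audio_list and appends
  // 'stream_end' as the last entry once the answer is complete.
  let index = 0
  for (;;) {
    await new Promise((resolve) => setTimeout(resolve, 100))
    const { results } = await (await fetch(`${llmUrl}/api/v1/audio`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ question }),
      mode: 'cors'
    })).json()
    const audioList = results[0].audio_list as string[]
    const isEnd = audioList.at(-1) === 'stream_end'
    if (isEnd) audioList.pop()
    const newList = audioList.slice(index)
    if (newList.length) onChunk(newList) // hand the new wav paths to TTS playback / photoRole
    index += newList.length
    if (isEnd) break
  }
}

async function interrupt(llmUrl: string): Promise<void> {
  // Cancelling a reply is now an explicit POST instead of closing a socket.
  await fetch(`${llmUrl}/api/v1/interrupt`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ end: 1 }),
    mode: 'cors'
  })
}

llmLoop() in each page inlines this loop: the photo page pushes each new clip to photoRole.enQueue, the other page queues HTMLAudioElements for runAudioPlay, and endAudioInput() now awaits llmEnd() (the interrupt call) before tearing down the input stream.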
......@@ -118,17 +118,22 @@ export default class IPCs {
static initializeChildWindow(window: BrowserWindow) {
ipcMain.on('fileUpload', async (event, path: string) => {
const content = IPCs.readFile(path)
const formData = new FormData()
const blob = new Blob([content], { type: 'audio/wav' })
formData.append('file', blob)
const response = await http({
url: 'https://beta.laihua.com/api/upload/file',
method: 'POST',
data: formData
})
window.webContents.send('msgReceivedFileUploadResponse', response)
try {
const content = IPCs.readFile(path)
const formData = new FormData()
const blob = new Blob([content], { type: 'audio/wav' })
formData.append('file', blob)
const response = await http({
url: 'https://beta.laihua.com/api/upload/file',
method: 'POST',
data: formData
})
window.webContents.send('msgReceivedFileUploadResponse', response)
} catch (error) {
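        // Report the failure back to the renderer instead of failing silently.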
window.webContents.send('msgReceivedFileUploadResponse', { code: 500, message: JSON.stringify(error) })
}
})
}
}
......@@ -26,7 +26,6 @@ const microphoneState = ref<'waitInput' | 'input' | 'loading' | 'disabled' | 're
const videoElement = ref<HTMLVideoElement | null>(null)
const can = ref<HTMLCanvasElement | null>(null)
let photoRole: PhotoRole | null = null
let flvPlayer: flvjs.Player | null = null
const inputContext: {
mediaStream?: MediaStream
audioContext?: AudioContext
......@@ -79,8 +78,6 @@ async function init() {
photoRole = new PhotoRole(settings.liveHost, `${item?.liveUrl}`, canvasEle)
photoRole.on('asyncAnswer', onAsyncAnswer)
// initPlayer(videoEle);
try {
await photoRole.init()
} catch (error) {
......@@ -113,78 +110,10 @@ async function onAsyncAnswer(ans: PhotoAnswer) {
}
}
function draw(
ctx: CanvasRenderingContext2D,
img: HTMLImageElement,
liveVideo?: HTMLVideoElement,
videoInfo?: {
center: {
x: number
y: number
}
width: number
height: number
r_w: number
r_h: number
}
) {
ctx.clearRect(0, 0, img.naturalWidth, img.naturalHeight)
ctx.drawImage(img, 0, 0, img.naturalWidth, img.naturalHeight)
if (liveVideo && videoInfo) {
const { center, r_w, r_h } = videoInfo
ctx.drawImage(liveVideo, center.x - r_w / 2, center.y - r_h / 2, r_w, r_h)
}
}
async function initPlayer(videoEle: HTMLVideoElement) {
flvPlayer = flvjs.createPlayer(
{
url: 'http://127.0.0.1:7001/live/movie.flv',
type: 'flv',
isLive: true,
cors: true
},
{
// enableWorker: true,
enableStashBuffer: false,
stashInitialSize: 128
}
)
flvPlayer.attachMediaElement(videoEle)
flvPlayer.load()
await flvPlayer.play()
}
router.beforeEach((g) => {
if (!g.query.url) return router.push('/error')
})
async function initVosk({
result,
partialResult
}: {
result?: (string) => void
partialResult?: (string) => void
}) {
const channel = new MessageChannel()
const model = await settings.downLoadVoskModel()
const recognizer = new model.KaldiRecognizer(sampleRate)
model.registerPort(channel.port1)
recognizer.setWords(true)
recognizer.on('result', (message) => {
result && result((message as ServerMessageResult).result.text)
})
recognizer.on('partialresult', (message) => {
partialResult && partialResult((message as ServerMessagePartialResult).result.partial)
})
return { recognizer, channel }
}
function analyzeMicrophoneVolume(stream: MediaStream, callback: (number) => void) {
const audioContext = new AudioContext()
const analyser = audioContext.createAnalyser()
......@@ -214,62 +143,6 @@ function analyzeMicrophoneVolume(stream: MediaStream, callback: (number) => void
inputContext.scriptProcessorNode = recordEventNode
}
async function startVoskWasmAudioInput() {
if (microphoneState.value === 'loading') return
if (microphoneState.value === 'input') {
endAudioInput()
return
}
microphoneState.value = 'loading'
const { recognizer, channel } = await initVosk({
result: onQ,
partialResult: (text) => {
// console.log('----------------> partialResult:', text)
}
})
sampleRate = 48000
const mediaStream = await navigator.mediaDevices.getUserMedia({
video: false,
audio: {
echoCancellation: true,
noiseSuppression: true,
channelCount: 1,
sampleRate
}
})
const audioContext = new AudioContext()
await audioContext.audioWorklet.addModule(
new URL('/vosk/recognizer-processor.js', import.meta.url)
)
const recognizerProcessor = new AudioWorkletNode(audioContext, 'recognizer-processor', {
channelCount: 1,
numberOfInputs: 1,
numberOfOutputs: 1
})
recognizerProcessor.port.postMessage({ action: 'init', recognizerId: recognizer.id }, [
channel.port2
])
recognizerProcessor.connect(audioContext.destination)
const source = audioContext.createMediaStreamSource(mediaStream)
source.connect(recognizerProcessor)
await analyzeMicrophoneVolume(mediaStream, (val) => {
recordVolume.value = val
})
microphoneState.value = 'input'
inputContext.mediaStream = mediaStream
inputContext.audioContext = audioContext
}
async function startVoskWsAudioInput() {
if (microphoneState.value === 'loading') return
......@@ -358,15 +231,21 @@ function initVoskWS() {
})
}
function initLLMSocket(): Promise<WebSocket> {
const ws = new WebSocket(settings.llmUrl)
return new Promise((resolve, reject) => {
ws.onopen = () => resolve(ws)
ws.onerror = reject
})
async function llmEnd() {
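  // Ask the LLM service to drop the in-flight generation; called from endAudioInput() when a reply is cancelled.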
  const resp = await (await fetch(`${settings.llmUrl}/api/v1/interrupt`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ end: 1 }),
    mode: 'cors'
  })).json()
console.log('---------------->', resp);
}
async function endAudioInput() {
await llmEnd()
microphoneState.value = 'waitInput'
inputContext.ws?.close()
inputContext.mediaStream?.getTracks().forEach((track) => track.stop())
......@@ -422,132 +301,93 @@ async function onQ(question: string) {
microphoneState.value = 'loading'
try {
const ws = await initLLMSocket()
const { pose, stepResolve, stepReject } = createStep()
const messageTimeout = setTimeout(async () => {
showError('llm:timeout!')
await endAudioInput()
microphoneState.value = 'waitInput'
}, 10000)
let sliceAnswer = ''
let answer = ''
let isTime = true
let sliceAnswerLength = 10
inputContext.ws = ws
inputContext.answerArray.length = 0
inputContext.steps.length = 0
inputContext.steps.push(pose)
photoRole!.answerArgs = new PhotoAnswer()
photoRole!.on('asyncAnswer', onAsyncAnswer)
ws.onmessage = (message) => {
clearTimeout(messageTimeout)
try {
let { text, event } = JSON.parse(message.data) as {
event: string
message_num: number
text: string
}
if (event === 'stream_end') {
inputContext.answerArray.push({ text: sliceAnswer, isLast: true })
sliceAnswer = ''
runTTSTask()
inputContext.ws?.close()
console.log('----------------> answer: ', answer)
stepResolve('chat')
return
}
text = text.replace(/\u0000/g, '').trim()
answer += text
photoRole!.answerArgs!.answer += answer
photoRole!.answerArgs!._typingAnswer.push(answer)
isTime && console.time('sliceAnswer')
isTime = false
const textArr = text.split('')
for (let i = 0; i < textArr.length; i++) {
const t = textArr[i]
sliceAnswer += t
if (/[。,?!;,.?!;]/.test(t) && sliceAnswer.length >= sliceAnswerLength) {
console.timeEnd('sliceAnswer')
sliceAnswerLength = settings.llmToTTSSliceLength
inputContext.answerArray.push({ text: sliceAnswer, isLast: true })
runTTSTask()
sliceAnswer = ''
isTime = true
}
}
} catch (error) {
showError('llm:' + error)
endAudioInput().then(() => {
microphoneState.value = 'waitInput'
})
stepReject(JSON.stringify(error))
}
}
const { pose, stepResolve, stepReject } = createStep()
inputContext.steps.length = 0
inputContext.steps.push(pose)
ws.send(JSON.stringify({ prompt: question, historys_list: [] }))
try {
await llmLoop(question);
stepResolve('llm')
} catch (error) {
console.error(error)
microphoneState.value = 'input'
showError(`llm${JSON.stringify(error)}`)
    showError(`llm:${error}`)
}
}
let isTTSRunning = false
async function runTTSTask() {
if (isTTSRunning) return
isTTSRunning = true
async function llmLoop(question: string) {
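  // Submit the question once, then poll /api/v1/audio until the returned audio list ends with 'stream_end', feeding each new clip to photoRole.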
microphoneState.value = 'loading'
  const resp = await (await fetch(`${settings.llmUrl}/api/v1/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ question }),
    mode: 'cors'
  })).json()
const { pose, stepResolve, stepReject } = createStep()
inputContext.steps.push(pose)
if (resp.results[0].status_code !== 100) {
throw new Error(`status_code: ${resp.results[0].status_code}; ${ JSON.stringify(resp.results) }`);
}
try {
while (inputContext.answerArray.length) {
const task = inputContext.answerArray.shift()
if (!task) break
if (task.text.trim().length < 1) continue
console.time(task.text + ' TTS: ')
const res = await localTTS({
url: settings.ttsHost,
text: task.text,
audio_path: settings.userData
})
inputContext.steps.length = 0
photoRole!.answerArgs = new PhotoAnswer()
photoRole!.on('asyncAnswer', onAsyncAnswer)
let index = 0;
while (true) {
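    // Keep polling until the user interrupts (microphoneState flips to 'input'/'waitInput') or the server signals the end of the stream.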
// @ts-ignore
if (microphoneState.value === 'input' || microphoneState.value === 'waitInput') {
break
}
console.log('----------------> TTS:', res[0].text)
console.timeEnd(task.text + ' TTS: ')
console.log('---------------->', res[0].text)
await new Promise( resolve => setTimeout(resolve, 100))
const audioPath = await uploadFile({ filePath: res[0].text })
      const { results } = await (await fetch(`${settings.llmUrl}/api/v1/audio`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        mode: 'cors',
        body: JSON.stringify({ question })
      })).json()
// @ts-ignore
if (microphoneState.value === 'input') {
break
}
const audioList = results[0].audio_list as string[];
if (audioList.length === 0) continue;
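      // A trailing 'stream_end' entry marks the answer as complete; strip it before slicing off the not-yet-played clips.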
const isEnd = audioList.at(-1) === 'stream_end';
if(isEnd) audioList.pop();
const newList = audioList.slice(index);
if (newList.length === 0 && isEnd) break;
if (newList.length === 0) continue;
for (let i = index; i < audioList.length; i++) {
console.log(results[0].text[i] +':'+ audioList[i]);
photoRole!.answerArgs!.answer += results[0].text[i]
photoRole!.answerArgs!._typingAnswer.push(...results[0].text[i].split(''))
}
index += newList.length;
const audioPaths = await Promise.all(newList.map(path => {
return uploadFile({ filePath: path })
}))
// @ts-ignore
if (microphoneState.value === 'input' || microphoneState.value === 'waitInput') {
break
}
audioPaths.forEach(audioPath => {
photoRole?.enQueue({
taskId: photoRole.sessionId,
audioUrl: `https://resources.laihua.com/${audioPath}`,
isLast: task.isLast
isLast: isEnd
})
}
} catch (error) {
showError('tts:' + error)
endAudioInput().then(() => {
microphoneState.value = 'waitInput'
})
stepReject(JSON.stringify(error))
}
isTTSRunning = false
stepResolve('TTS')
if (isEnd) break;
}
}
function uploadFile({ filePath }: { filePath: string }) {
......@@ -565,24 +405,6 @@ function uploadFile({ filePath }: { filePath: string }) {
})
}
const ttsAudios: HTMLAudioElement[] = []
let isPlayRunning = false
async function runAudioPlay() {
if (isPlayRunning) return
isPlayRunning = true
const audio = ttsAudios.shift()
if (!audio) {
isPlayRunning = false
return
}
audio.onended = () => {
isPlayRunning = false
runAudioPlay()
}
await audio.play()
}
async function down() {
if (microphoneState.value === 'reply') {
await endAudioInput()
......
......@@ -34,11 +34,13 @@ const inputContext: {
ws?: WebSocket
voskWs?: WebSocket
asrPartial: string
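  // llmEnd: set by llmLoop() once the full answer has arrived; runAudioPlay() only returns the UI to 'input' after that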
llmEnd: boolean
ttsAudios: HTMLAudioElement[]
playingAudio?: HTMLAudioElement
answerArray: string[]
} = {
asrPartial: '',
answerArray: []
llmEnd: false,
ttsAudios: []
}
onMounted(() => {
......@@ -56,30 +58,6 @@ const showError = (msg: string) => {
errorMsg.value = msg
}
async function initVosk({
result,
partialResult
}: {
result?: (string) => void
partialResult?: (string) => void
}) {
const channel = new MessageChannel()
const model = await settings.downLoadVoskModel()
const recognizer = new model.KaldiRecognizer(sampleRate)
model.registerPort(channel.port1)
recognizer.setWords(true)
recognizer.on('result', (message) => {
result && result((message as ServerMessageResult).result.text)
})
recognizer.on('partialresult', (message) => {
partialResult && partialResult((message as ServerMessagePartialResult).result.partial)
})
return { recognizer, channel }
}
function analyzeMicrophoneVolume(stream: MediaStream, callback: (number) => void) {
const audioContext = new AudioContext()
const analyser = audioContext.createAnalyser()
......@@ -109,62 +87,6 @@ function analyzeMicrophoneVolume(stream: MediaStream, callback: (number) => void
inputContext.scriptProcessorNode = recordEventNode
}
async function startVoskWasmAudioInput() {
if (microphoneState.value === 'loading') return
if (microphoneState.value === 'input') {
endAudioInput()
return
}
microphoneState.value = 'loading'
const { recognizer, channel } = await initVosk({
result: onQ,
partialResult: (text) => {
// console.log('----------------> partialResult:', text)
}
})
sampleRate = 48000
const mediaStream = await navigator.mediaDevices.getUserMedia({
video: false,
audio: {
echoCancellation: true,
noiseSuppression: true,
channelCount: 1,
sampleRate
}
})
const audioContext = new AudioContext()
await audioContext.audioWorklet.addModule(
new URL('/vosk/recognizer-processor.js', import.meta.url)
)
const recognizerProcessor = new AudioWorkletNode(audioContext, 'recognizer-processor', {
channelCount: 1,
numberOfInputs: 1,
numberOfOutputs: 1
})
recognizerProcessor.port.postMessage({ action: 'init', recognizerId: recognizer.id }, [
channel.port2
])
recognizerProcessor.connect(audioContext.destination)
const source = audioContext.createMediaStreamSource(mediaStream)
source.connect(recognizerProcessor)
await analyzeMicrophoneVolume(mediaStream, (val) => {
recordVolume.value = val
})
microphoneState.value = 'input'
inputContext.mediaStream = mediaStream
inputContext.audioContext = audioContext
}
async function startVoskWsAudioInput() {
if (microphoneState.value === 'loading') return
......@@ -253,15 +175,21 @@ function initVoskWS() {
})
}
function initLLMSocket(): Promise<WebSocket> {
const ws = new WebSocket(settings.llmUrl)
return new Promise((resolve, reject) => {
ws.onopen = () => resolve(ws)
ws.onerror = reject
})
async function llmEnd() {
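  // Ask the LLM service to drop the in-flight generation; endAudioInput() awaits this before tearing down the microphone.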
  const resp = await (await fetch(`${settings.llmUrl}/api/v1/interrupt`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ end: 1 }),
    mode: 'cors'
  })).json()
console.log('---------------->', resp);
}
function endAudioInput() {
async function endAudioInput() {
await llmEnd()
microphoneState.value = 'waitInput'
inputContext.ws?.close()
inputContext.mediaStream?.getTracks().forEach((track) => track.stop())
......@@ -273,12 +201,11 @@ function endAudioInput() {
inputContext.voskWs.send('{"eof" : 1}')
inputContext.voskWs.close()
}
ttsAudios.length = 0
inputContext.ttsAudios.length = 0
inputContext.playingAudio?.pause()
videos[1].value?.pause()
videos[0].value?.pause()
isPlayRunning = false
inputContext.answerArray.length = 0
}
const canplay = () => {
......@@ -347,126 +274,104 @@ async function onQ(question: string) {
return
}
  // No video link matched, fall back to the LLM directly
try {
const ws = await initLLMSocket()
const messageTimeout = setTimeout(() => {
showError('llm:timeout!')
endAudioInput()
microphoneState.value = 'waitInput'
}, 10000)
let sliceAnswer = ''
let answer = ''
let isTime = true
let sliceAnswerLength = 10
inputContext.ws = ws
ws.onmessage = (message) => {
clearTimeout(messageTimeout)
if (microphoneState.value === 'input') {
return
}
try {
let { text, event } = JSON.parse(message.data) as {
event: string
message_num: number
text: string
}
if (event === 'stream_end') {
inputContext.answerArray.push(sliceAnswer)
runTTSTask()
sliceAnswer = ''
inputContext.ws?.close()
console.log('----------------> answer: ', answer)
return
}
text = text.replace(/\u0000/g, '').trim()
answer += text
isTime && console.time('sliceAnswer')
isTime = false
const textArr = text.split('')
for (let i = 0; i < textArr.length; i++) {
const t = textArr[i]
sliceAnswer += t
if (/[。,?!;,.?!;]/.test(t) && sliceAnswer.length >= sliceAnswerLength) {
console.timeEnd('sliceAnswer')
sliceAnswerLength = settings.llmToTTSSliceLength
inputContext.answerArray.push(sliceAnswer)
runTTSTask()
sliceAnswer = ''
isTime = true
}
}
} catch (error) {
console.error(error)
showError(`message:${error}`)
microphoneState.value = 'waitInput'
}
}
ws.send(JSON.stringify({ prompt: question, historys_list: [] }))
    // No video link matched, fall back to the LLM directly
await llmLoop(question);
} catch (error) {
console.error(error)
microphoneState.value = 'input'
showError(`llm:${JSON.stringify(error)}`)
showError(`llm:${error}`)
}
}
let isTTSRunning = false
async function runTTSTask() {
if (isTTSRunning) return
isTTSRunning = true
async function llmLoop(question: string) {
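  // Same polling flow as above: submit the question, poll /api/v1/audio until 'stream_end', and queue each returned wav for local playback.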
if (!role) return;
microphoneState.value = 'loading'
try {
while (inputContext.answerArray.length) {
const task = inputContext.answerArray.shift()
if (!task) break
if (task.trim().length < 1) continue
    const resp = await (await fetch(`${settings.llmUrl}/api/v1/generate`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ question }),
      mode: 'cors'
    })).json()
console.time(task + ' TTS: ')
if (resp.results[0].status_code !== 100) {
throw new Error(`status_code: ${resp.results[0].status_code}; ${ JSON.stringify(resp.results) }`);
}
const res = await localTTS({
url: settings.ttsHost,
text: task,
audio_path: settings.userData
})
inputContext.llmEnd = false;
let index = 0;
console.log('----------------> TTS:', res[0].text)
console.timeEnd(task + ' TTS: ')
while (true) {
// @ts-ignore
if (microphoneState.value === 'input' || microphoneState.value === 'waitInput') {
break
}
// @ts-ignore
if (microphoneState.value === 'input') {
break
}
await new Promise( resolve => setTimeout(resolve, 100))
const audio = new Audio(`file://${res[0].text}`)
audio.load()
ttsAudios.push(audio)
runAudioPlay()
      const { results } = await (await fetch(`${settings.llmUrl}/api/v1/audio`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        mode: 'cors',
        body: JSON.stringify({ question })
      })).json()
const audioList = results[0].audio_list as string[];
if (audioList.length === 0) continue;
const isEnd = audioList.at(-1) === 'stream_end';
if(isEnd) audioList.pop();
const newList = audioList.slice(index);
if (newList.length === 0 && isEnd) break;
if (newList.length === 0) continue;
for (let i = index; i < audioList.length; i++) {
console.log(results[0].text[i] +':'+ audioList[i]);
}
} catch (error) {
showError(`tts:${error}`)
microphoneState.value = 'waitInput'
console.error(error)
index += newList.length;
// @ts-ignore
if (microphoneState.value === 'input' || microphoneState.value === 'waitInput') {
break
}
// inputContext.ttsAudios.push(...newList.map(path => {
// const audio = new Audio(`file://${path}`)
// audio.load()
// return audio;
// }))
// TODO: test
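      // Temporary: fetch the generated wav files from a hard-coded local static server rather than via file:// (see the commented-out variant above).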
inputContext.ttsAudios.push(...newList.map(path => {
const audio = new Audio(`http://192.168.1.57:6767/${path.split('\\').pop()}`)
audio.load()
return audio;
}))
runAudioPlay()
if (isEnd) break;
}
isTTSRunning = false
inputContext.llmEnd = true;
}
const ttsAudios: HTMLAudioElement[] = []
let isPlayRunning = false
async function runAudioPlay() {
if (isPlayRunning) return
isPlayRunning = true
const audio = ttsAudios.shift()
const audio = inputContext.ttsAudios.shift()
if (!audio) {
isPlayRunning = false
videos[0].value!.pause()
!isTTSRunning && (microphoneState.value = 'input')
inputContext.llmEnd && (microphoneState.value = 'input')
return
}
audio.onended = () => {
......@@ -482,23 +387,9 @@ async function runAudioPlay() {
microphoneState.value = 'reply'
}
// eslint-disable-next-line no-unused-vars
async function xfTTS(text: string) {
const tone = settings.source.find(({ sourceId }) => settings.selectSource === sourceId)
if (!tone) return
const res = await audioAiTTS({
host: settings.ttsHost,
text,
speed: 3,
speaker: tone.sourceId,
provider: tone.provider
})
console.log('----------------> tts:', res)
}
async function down() {
if (microphoneState.value === 'reply') {
endAudioInput()
await endAudioInput()
}
try {
......