Implementing AI voice features on the frontend

webkitSpeechRecognition (speech recognition)

// Create a webkitSpeechRecognition instance
let newRecognition = new webkitSpeechRecognition();
// Whether recognition keeps listening continuously or stops after the first result
newRecognition.continuous = true;
// Start recording
newRecognition.start();
// Stop recording
newRecognition.stop();
// Fires whenever a result is recognized, so it fires multiple times
// if you pause repeatedly while speaking
newRecognition.onresult = function (event) {
    console.log(event);
};
// The above already covers speech-to-text; below are some other events.
// As soon as you start talking, the onsoundstart and onspeechstart events fire
newRecognition.onsoundstart = function (e) {
    console.log('Started listening');
    console.log(e);
};
newRecognition.onspeechstart = (e) => {
    console.log('Started speaking');
    console.log(e);
};
// onspeechend appears to fire when the stop() method runs. If nothing is
// heard for a while, recognition stops automatically (as if stop() were
// called), which also fires onspeechend
newRecognition.onspeechend = (e) => {
    console.log('Finished speaking');
    console.log(e);
};
// onerror fires when something goes wrong; in testing it also fires when a
// long silence causes recording to shut down automatically
newRecognition.onerror = (e) => {
    console.log('An error occurred');
    console.log(e);
};
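
webkitSpeechRecognition is the prefixed, Chromium-only name for the Web Speech API's SpeechRecognition interface, so it is worth feature-detecting before constructing one. A minimal sketch (nothing here beyond standard window globals):

// Prefer the unprefixed constructor and fall back to the webkit-prefixed one
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
if (!SpeechRecognition) {
    console.log('This browser does not support the Web Speech API');
} else {
    const recognition = new SpeechRecognition();
    recognition.lang = 'zh-CN'; // recognition language
    recognition.start();
}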

Vue 3 demo

<template>
  <div>
    <textarea
      name="speech-to-text-area"
      id="area"
      cols="30"
      rows="10"
      placeholder="Say something..."
      ref="areaRef"
      v-model="textInp"></textarea>
    <div>
      <button @click="speek">🎤</button>
      <p @click="addText">Publish</p>
    </div>
    <ul v-show="list.length > 0">
      <li v-for="(item, index) in list" :key="index">{{ item }}</li>
    </ul>
  </div>
</template>

<script>
import { reactive, toRefs, ref } from 'vue'

export default {
  setup () {
    const areaRef = ref(null)
    const state = reactive({
      list: [],
      textInp: '',
      isGoing: false
    })
    const recognition = new window.webkitSpeechRecognition();
    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.lang = 'zh-CN';
    recognition.onresult = function (event) {
      // Concatenate every result so far, interim ones included,
      // so the textarea always shows the full current transcript
      let result = ''
      for (let i = 0; i < event.results.length; i++) {
        result += event.results[i][0].transcript;
      }
      state.textInp = result
    }
    /** Start / stop recording */
    function speek () {
      if (state.isGoing) {
        recognition.stop();
        state.isGoing = false;
      } else {
        recognition.start();
        state.isGoing = true;
      }
    }
    /** Publish */
    function addText () {
      state.list.push(state.textInp);
      state.textInp = '';
    }

    return {
      ...toRefs(state),
      areaRef,
      addText,
      speek
    }
  }
}
</script>

<style lang="scss" scoped>

</style>
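
One gap worth noting in this demo: if the component unmounts while recognition is running, the microphone session keeps going. A minimal cleanup sketch, assuming the same recognition and state from the setup() above:

import { onBeforeUnmount } from 'vue'

// Inside setup(): stop the recognizer when the component is torn down
onBeforeUnmount(() => {
    if (state.isGoing) {
        recognition.stop();
        state.isGoing = false;
    }
});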

SpeechSynthesisUtterance (speech synthesis)

Basic test

var ssu = new window.SpeechSynthesisUtterance('Hi,girl!');
window.speechSynthesis.speak(ssu);

The SpeechSynthesisUtterance object exposes some other configurable properties:

  • lang: the language to use, as a string (e.g. "zh-CN")

  • volume: volume, between 0 and 1 (default 1)

  • rate: speech rate multiplier, between 0.1 and 10 (default 1)

  • pitch: pitch, between 0 and 2 (default 1)

  • voiceURI: the voice you want to use, as a string

  • onstart: fired when speech synthesis starts

  • onpause: fired when speech is paused

  • onresume: fired when synthesis resumes

  • onend: fired when speech finishes

var ssu = new window.SpeechSynthesisUtterance();
ssu.text = 'Hi,girl!';
ssu.volume = 0.5;
ssu.rate = 1;
// Don't forget to actually queue the utterance for playback
window.speechSynthesis.speak(ssu);

The speechSynthesis object
Once the SpeechSynthesisUtterance object is created, pass it to the speechSynthesis object's speak method. speechSynthesis also provides:

  • cancel(): stops synthesis and empties the utterance queue

  • pause(): pauses synthesis

  • resume(): resumes a paused synthesis

  • getVoices(): returns the array of voices the browser supports

window.addEventListener("click", () => {
    window.speechSynthesis.pause(); // pause on click
});
console.log(window.speechSynthesis.getVoices()); // an empty array in my Chrome... wtf
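
The empty array is actually expected on first call: Chrome populates the voice list asynchronously, so getVoices() returns [] until the voiceschanged event has fired. A minimal sketch of the reliable way to read it:

// Wait for the voice list to load before reading it
window.speechSynthesis.addEventListener('voiceschanged', () => {
    const voices = window.speechSynthesis.getVoices();
    console.log(voices); // now a non-empty array of SpeechSynthesisVoice objects
});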

demo:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Speech to text</title>
    <style>
        textarea {
            width: 100%;
            height: 50px;
        }
    </style>
</head>
<body>
    <div>
        <textarea id="area" placeholder="Please say something..."></textarea>
        <button id="speek">Microphone</button>
        <button id="addBtn">Publish</button>
        <ul id="text"></ul>
    </div>

    <script>
        window.onload = () => {
            console.log('Page loaded');
            const area = document.querySelector('#area');
            const speek = document.querySelector('#speek');
            const addBtn = document.querySelector('#addBtn');
            const text = document.querySelector('#text');
            const recognition = new webkitSpeechRecognition();
            let isSpeek = false;

            recognition.continuous = true;
            recognition.interimResults = true;
            recognition.lang = 'zh-CN';
            recognition.onresult = function (event) {
                // Collect only the finalized results across the whole session
                let result = '';
                for (let i = 0; i < event.results.length; i++) {
                    if (event.results[i].isFinal) {
                        result += event.results[i][0].transcript;
                    }
                }
                area.value = result;
            };

            speek.addEventListener('click', () => {
                if (isSpeek) {
                    recognition.stop();
                    isSpeek = false;
                    return;
                }
                recognition.start();
                isSpeek = true;
            });

            addBtn.addEventListener('click', () => {
                const li = document.createElement('li');
                li.textContent = area.value;
                text.appendChild(li);
                area.value = '';
            });
        }
    </script>
</body>
</html>

Web frontend implementation of edge-tts speech synthesis

1. Usage

Import the module below wherever it is needed:

var voiceList = [{"ShortName":"zh-CN-XiaoxiaoNeural","label":"Xiaoxiao"},{"ShortName":"zh-CN-XiaoyiNeural","label":"Xiaoyi"},{"ShortName":"zh-CN-YunjianNeural","label":"Yunjian"},{"ShortName":"zh-CN-YunxiNeural","label":"Yunxi"},{"ShortName":"zh-CN-YunxiaNeural","label":"Yunxia"},{"ShortName":"zh-CN-YunyangNeural","label":"Yunyang"}];
let ws = null;
let blobs = [];
let audioElement = document.createElement('audio');

function sendReq(ssml, format, connectionId) {
    let configData = {
        context: {
            synthesis: {
                audio: {
                    metadataoptions: {
                        sentenceBoundaryEnabled: "false",
                        wordBoundaryEnabled: "false",
                    },
                    outputFormat: format,
                },
            },
        },
    };
    let configMessage =
        `X-Timestamp:${Date()}\r\n` +
        "Content-Type:application/json; charset=utf-8\r\n" +
        "Path:speech.config\r\n\r\n" +
        JSON.stringify(configData);
    console.log(`Sending config request: ${configMessage}\n`);
    let ssmlMessage =
        `X-Timestamp:${Date()}\r\n` +
        `X-RequestId:${connectionId}\r\n` +
        `Content-Type:application/ssml+xml\r\n` +
        `Path:ssml\r\n\r\n` +
        ssml;
    console.log(`Sending SSML message: ${ssmlMessage}\n`);
    // The browser WebSocket.send() takes no callback; send errors surface via onerror
    ws.send(configMessage);
    ws.send(ssmlMessage);
}

function generateRandomHex() {
    // Create a 16-byte Uint8Array
    const randomBytes = new Uint8Array(16);
    // Fill each element with a cryptographically random integer in 0-255
    window.crypto.getRandomValues(randomBytes);
    // Convert the byte array to a lowercase hex string
    const hexString = Array.from(randomBytes)
        .map(byte => byte.toString(16).padStart(2, '0'))
        .join('')
        .toLowerCase();
    return hexString;
}

async function connect(ssml, format, autoPlay) {
    return new Promise((resolve, reject) => {
        const connectionId = generateRandomHex();
        let url = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=6A5AA1D4EAFF4E9FB37E23D68491D6F4&ConnectionId=${connectionId}`;
        ws = new window.WebSocket(url);
        ws.onopen = () => {
            console.log("wsOpen");
            sendReq(ssml, format, connectionId);
        };
        ws.onclose = (event) => {
            // The server drops connections that stay idle for more than 30 seconds
            ws = null;
            blobs = [];
            console.log(`Connection closed: ${event.reason} ${event.code}`);
        };
        ws.onmessage = (message) => {
            if (!(message.data instanceof Blob)) {
                let data = message.data.toString();
                if (data.includes("Path:turn.start")) {
                    // Transfer started
                } else if (data.includes("Path:turn.end")) {
                    // Transfer finished: strip the text headers from each binary frame
                    for (let i = 0; i < blobs.length; i++) {
                        let contentIndex = 130;
                        if (i == blobs.length - 1) {
                            contentIndex = 105;
                        }
                        blobs[i] = blobs[i].slice(contentIndex);
                    }
                    let result = new Blob(blobs);
                    let url = URL.createObjectURL(result);
                    if (autoPlay) {
                        audioElement.pause();
                        audioElement.src = url;
                        audioElement.play();
                    }
                    blobs = [];
                    ws.close();
                    console.log(`Transfer complete: ${url}`);
                    resolve(url);
                }
            } else if (message.data instanceof Blob) {
                console.log("Received an audio chunk...", message.data);
                blobs.push(message.data);
            }
        };
        ws.onerror = (error) => {
            console.log(`Connection failed: ${error}`);
            reject(error);
        };
    });
}


export async function start(text, voice = 1, rate = 0, pitch = 0, autoPlay = true) {
    if (text) {
        let SSML = "";
        console.log("text", text);
        console.log("voice", voiceList[voice].ShortName);
        console.log("rate", rate);
        console.log("pitch", pitch);
        SSML = `
<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US">
    <voice name="${voiceList[voice].ShortName}">
        <prosody rate="${rate}%" pitch="${pitch}%">
            ${text}
        </prosody>
    </voice>
</speak>`;
        console.log(SSML);
        let format = "audio-24khz-48kbitrate-mono-mp3";
        let result = await connect(SSML, format, autoPlay);
        console.log('Received result:', result);
        return result;
    }
}

2. Method description

The start method takes the text to speak plus the voice configuration.

Call example:

const startSpeak = async () => {
    let url = await start('欢迎来到中国联通智慧大脑展厅,接下来为各位来宾介绍政企大屏;', 2, 0, 0);
}

Return example:
blob:http://localhost:8080/f0cf900d-81ed-4e0f-bade-058726453781

Parameters:

  • text: the text to convert to speech (required)

  • voice: index of the Chinese voice to use (see the ShortName entries in section 3)

  • rate: speech rate, default 0

  • pitch: pitch, default 0

  • autoPlay: whether to auto-play the result, default true
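
With autoPlay set to false, nothing plays automatically and the returned blob URL is yours to use; a small sketch (the module path and button id are hypothetical):

import { start } from './edgeTTS'; // hypothetical path to the module above

document.querySelector('#playBtn').addEventListener('click', async () => {
    const url = await start('你好,世界', 2, 0, 0, false); // autoPlay off
    const audio = new Audio(url); // play the synthesized MP3 manually
    audio.play();
});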

3. Voice roles

voice: which voice to use, an index from 0 to 5

ShortName: the voice's name
[
    {
        "ShortName": "zh-CN-XiaoxiaoNeural",
        "label": "Xiaoxiao"
    },
    {
        "ShortName": "zh-CN-XiaoyiNeural",
        "label": "Xiaoyi"
    },
    {
        "ShortName": "zh-CN-YunjianNeural",
        "label": "Yunjian"
    },
    {
        "ShortName": "zh-CN-YunxiNeural",
        "label": "Yunxi"
    },
    {
        "ShortName": "zh-CN-YunxiaNeural",
        "label": "Yunxia"
    },
    {
        "ShortName": "zh-CN-YunyangNeural",
        "label": "Yunyang"
    }
]
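
If you would rather select a voice by its label than remember its index, a small helper sketch (assuming the voiceList array above is in scope):

// Map a label such as 'Yunxi' to its index in voiceList, falling back to 0
function voiceIndexByLabel(label) {
    const index = voiceList.findIndex(v => v.label === label);
    return index === -1 ? 0 : index;
}

start('你好', voiceIndexByLabel('Yunxi')); // same as start('你好', 3)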