vits will ship as a built-in feature in this update

avilliai 2024-09-22 22:28:10 +08:00
parent 57839c37fe
commit 24a1d428f0
9 changed files with 239 additions and 206 deletions

View File

@@ -11,8 +11,8 @@
fuckingnumber: 13 #leave the group if it has fewer than 13 members
语音功能设置:
#modelscopeTTS or outVits is recommended (FishTTS is currently unavailable); other modes require extra deployment.
voicegenerate: modelscopeTTS #synthesis mode: modelscopeTTS/outVits/FishTTS (no longer available)/vits (requires extra deployment)/bert_vits2 (requires extra deployment)/so-vits (requires extra deployment)
speaker: "东雪莲" #set the character for your synthesis mode; see Manyana/config/语音合成可用角色.txt (not needed for vits)
voicegenerate: modelscopeTTS #synthesis mode: modelscopeTTS/outVits/FishTTS (no longer available)/vits (local synthesis)/bert_vits2 (requires extra deployment)/so-vits (requires extra deployment)
speaker: "东雪莲" #set the character for your synthesis mode; best to send "@bot 角色" after startup to list the options, or see Manyana/config/语音合成可用角色.txt (not needed for vits)
voiceLangType: "<zh>" #default synthesis language; options: <zh> <jp> <en>. Currently only effective in modelscopeTTS and firefly modes; in firefly mode it must match the speaker's language
prefix: "" #prefix for the synthesis command; default is xx说 ; with "/" the command becomes /xx说 , to reduce accidental triggers.
#Do not touch the two settings below unless you have deployed the corresponding services yourself.
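For reference, a minimal sketch of a settings block that exercises the new built-in vits mode; the speaker name is illustrative and must match a model shipped under vits/voiceModel/:

语音功能设置:
  voicegenerate: vits       # local synthesis; no extra deployment after this commit
  speaker: "綾地寧々"        # illustrative; must appear in a local model's speakers list
  voiceLangType: "<zh>"     # with mixture cleaners the text is wrapped as [ZH]...[ZH]
  prefix: ""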

View File

@@ -80,32 +80,6 @@ with open('data/chatGLMData.yaml', 'r', encoding='utf-8') as f:
chatGLMData = cha
async def tstt(r):
data1 = {'speaker': speaker}
st8 = re.sub(r"[^]*", "", r) # raw string (r prefix) to avoid escape-character issues
data1["text"] = st8
if voicegg == "vits":
logger.info("调用vits语音回复")
path = 'data/voices/' + random_str() + '.wav'
if voiceLangType == "<jp>":
texts = await translate(str(st8))
tex = '[JA]' + texts + '[JA]'
else:
tex = "[ZH]" + st8 + "[ZH]"
logger.info("启动文本转语音text: " + tex + " path: " + path)
# spe = rte.get("defaultModel").get("speaker")
with open('config/autoSettings.yaml', 'r', encoding='utf-8') as f:
resulte = yaml.load(f.read(), Loader=yaml.FullLoader)
spe = resulte.get("defaultModel").get("speaker")
modelSelect = resulte.get("defaultModel").get("modelSelect")
await voiceGenerate({"text": tex, "out": path, "speaker": spe, "modelSelect": modelSelect})
else:
logger.info(f"调用{voicegg}语音合成")
path = await superVG(data1, voicegg, berturl, voiceLangType)
return path
async def loop_run_in_executor(executor, func, *args):
try:
r = await executor.run_in_executor(None, func, *args)

View File

@@ -1,46 +1,21 @@
import datetime
import os
from vits import utils
import yaml
def modelLoader():
global modelDll
modelDll = {}
a = os.listdir('vits/voiceModel')
# print(type(a))
ind = 0
global CHOISE
CHOISE = {}
models={}
for i in a:
# print(i)
if os.path.isdir('vits/voiceModel/' + i):
# inner loop: collect the model files
file = os.listdir('vits/voiceModel/' + i)
for ass in file:
configPath = 'vits/voiceModel/' + i + '/config.json'
with open(configPath, 'r', encoding='utf-8') as file:
data = yaml.load(file, Loader=yaml.FullLoader)
speakers = data['speakers']
text_cleaners = data["data"]['text_cleaners']
for ass in os.listdir('vits/voiceModel/' + i):
if ass.endswith('.pth'):
hps_ms = utils.get_hparams_from_file('vits/voiceModel/' + i + '/config.json')
speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0']
muspeakers = {}
for id, name in enumerate(speakers):
muspeakers[str(id)] = name
CHOISE[name] = [str(id),
['vits/voiceModel/' + i + '/' + ass, 'vits/voiceModel/' + i + '/config.json']]
modelDll[str(ind)] = ['vits/voiceModel/' + i + '/' + ass, 'vits/voiceModel/' + i + '/config.json',
muspeakers]
time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
modelSelect = ['vits/voiceModel/' + i + '/' + ass, 'vits/voiceModel/' + i + '/config.json',
muspeakers]
#print(time + '| loaded model files under voiceModel/' + i + ': ' + str(muspeakers))
ind += 1
else:
pass
else:
pass
#print(modelDll)
return modelDll, modelSelect, CHOISE
modelPath = 'vits/voiceModel/' + i + '/' + ass
models[str(speakers)]={"speakers":speakers,'modelPath':modelPath,'configPath':configPath,'text_cleaners':text_cleaners}
return models
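For orientation, a sketch of the mapping the reworked modelLoader now returns (paths illustrative, based on the nene model referenced elsewhere in this commit). The key is the stringified speakers list, which is why callers can locate a model with a plain substring test such as `if speaker in key`:

# hypothetical shape of the returned mapping for one model folder
models = {
    "['綾地寧々']": {
        "speakers": ["綾地寧々"],
        "modelPath": "vits/voiceModel/nene/1374_epochsm.pth",
        "configPath": "vits/voiceModel/nene/config.json",
        "text_cleaners": ["zh_ja_mixture_cleaners"],
    }
}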

View File

@@ -1,8 +1,12 @@
# -*- coding: utf-8 -*-
import json
import os
import re
import asyncio
from concurrent.futures import ThreadPoolExecutor
import httpx
import requests
import websockets
@@ -13,7 +17,7 @@ from plugins.toolkits import translate,random_str,random_session_hash
try:
from plugins.modelsLoader import modelLoader
models, default, characters = modelLoader() # load the local models
models = modelLoader() # load the local models
from vits import vG
except:
pass
@@ -81,22 +85,26 @@ async def superVG(data, mode, urls="", langmode="<zh>"):
text = data['text']
out = data["out"]
try:
speaker = data['speaker']
modelSelect = data['modelSelect']
except Exception as e:
speaker = 2
modelSelect = ['vits/voiceModel/nene/1374_epochsm.pth', 'vits/voiceModel/nene/config.json']
print(e)
# with open('config/settings.yaml', 'r', encoding='utf-8') as f:
# result = yaml.load(f.read(), Loader=yaml.FullLoader)
# speaker = result.get("vits").get("speaker")
# modelSelect = result.get("vits").get("modelSelect")
# call voiceG()
if modelSelect[0].endswith("I.pth"):
text = text.replace("[JA]", "").replace("[ZH]", "")
# print("get")
await vG(tex=text, out=out, speakerID=speaker, modelSelect=modelSelect)
speaker = data['speaker']
modelSelect = ['vits/voiceModel/nene/1374_epochsm.pth', 'vits/voiceModel/nene/config.json']
speakerId="0"
for i in models:
if speaker in i:
modelSelect = [models[i]["modelPath"], models[i]["configPath"]]
speakerId=models[i]['speakers'].index(speaker)
if "mixture" in models[i]['text_cleaners'][0]:
if langmode == "<zh>":
text =f"[ZH]{text}[ZH]"
elif langmode == "<en>":
text = f"[EN]{text}[EN]"
elif langmode == "<jp>":
text = await translate(text)
text=f"[JA]{text}[JA]"
break
loop = asyncio.get_event_loop()
# run the synchronous vG in a thread pool so the event loop is not blocked
await loop.run_in_executor(ThreadPoolExecutor(), vG, text, out, speakerId, modelSelect)
print("语音生成完成")
return out
elif mode == "bert_vits2":
@@ -788,3 +796,5 @@ async def modelscopeTTS(data):
with open(p, "wb") as f:
f.write(r.content)
return p
#asyncio.run(superVG({"text": "你好,欢迎使用语音合成服务。", "out": "output.wav", "speaker": "綾地寧々"},"vits", urls="", langmode="<zh>"))
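Expanding the commented smoke test above, a minimal usage sketch of the new built-in vits path (assumes a local model whose speakers list contains the given name):

import asyncio
from plugins.vitsGenerate import superVG

async def demo():
    # superVG matches "綾地寧々" against the loaded models, wraps the text
    # as [ZH]...[ZH] for mixture cleaners, and runs vG in a thread pool
    path = await superVG(
        {"text": "你好,欢迎使用语音合成服务。", "out": "output.wav", "speaker": "綾地寧々"},
        "vits", urls="", langmode="<zh>")
    print(path)

asyncio.run(demo())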

View File

@@ -12,7 +12,9 @@ from mirai import FriendMessage, GroupMessage, At,Image
from mirai import Voice, Startup
from mirai.models import NudgeEvent
from plugins.aiReplyCore import modelReply, clearAllPrompts, tstt, clearsinglePrompt
from plugins.aiReplyCore import modelReply, clearAllPrompts,clearsinglePrompt
from plugins.toolkits import random_str
from plugins.vitsGenerate import superVG
from plugins.wReply.wontRep import wontrep
@@ -67,7 +69,9 @@ def main(bot, master, logger):
maxTextLen = result.get("chatGLM").get("maxLen")
voiceRate = result.get("chatGLM").get("voiceRate")
withText = result.get("chatGLM").get("withText")
speaker = result.get("语音功能设置").get("speaker")
voicegenerateMode = result.get("语音功能设置").get("voicegenerate")
voiceLangType = result.get("语音功能设置").get("voiceLangType")
with open('config.json', 'r', encoding='utf-8') as f:
data = yaml.load(f.read(), Loader=yaml.FullLoader)
config = data
@@ -157,7 +161,11 @@ def main(bot, master, logger):
r= await modelReply("指挥", event.from_id, text)
if len(r) < maxTextLen and random.randint(0, 100) < voiceRate and "出错,请重试" not in r:
try:
voiceP = await tstt(r)
path = 'data/voices/' + random_str() + '.wav'
logger.info("语音生成_文本" + r)
logger.info("语音生成_模型:" + speaker)
data = {"text": r, "out": path, 'speaker': speaker}  # synthesize the model reply r, not the user's input
voiceP = await superVG(data, mode=voicegenerateMode, urls="", langmode=voiceLangType)
await bot.send_group_message(event.subject.id, Voice(path=voiceP))
if withText:
await bot.send_group_message(event.subject.id, r)
@@ -206,7 +214,11 @@ def main(bot, master, logger):
await bot.send(event, "如对话异常请发送 /clear 以清理对话", True)
if len(r) < maxTextLen and random.randint(0, 100) < voiceRate and "出错,请重试" not in r:
try:
voiceP = await tstt(r)
path = 'data/voices/' + random_str() + '.wav'
logger.info("语音生成_文本" + r)
logger.info("语音生成_模型:" + speaker)
data = {"text": r, "out": path, 'speaker': speaker}  # synthesize the model reply r, not the user's input
voiceP = await superVG(data, mode=voicegenerateMode, urls="", langmode=voiceLangType)
await bot.send(event, Voice(path=voiceP))
if withText:
await bot.send(event, r, True)
@@ -366,7 +378,11 @@ def main(bot, master, logger):
chattingUser[user] = datetime.datetime.now()
if len(r) < maxTextLen and random.randint(0, 100) < voiceRate and "出错,请重试" not in r:
try:
voiceP = await tstt(r)
path = 'data/voices/' + random_str() + '.wav'
logger.info("语音生成_文本" + r)
logger.info("语音生成_模型:" + speaker)
data = {"text": r, "out": path, 'speaker': speaker}  # synthesize the model reply r, not the user's input
voiceP = await superVG(data, mode=voicegenerateMode, urls="", langmode=voiceLangType)
await bot.send(event, Voice(path=voiceP))
if withText:
await bot.send(event, r, True)
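The synthesize-and-send block above now repeats across three handlers; a hedged sketch of a helper that could fold them together (voice_for is a hypothetical name, not part of this commit; it assumes definition inside main() so speaker, voicegenerateMode, voiceLangType, and logger are in scope):

async def voice_for(reply: str) -> str:
    # hypothetical helper: build the payload once and return the generated file path
    path = 'data/voices/' + random_str() + '.wav'
    logger.info("语音生成_文本" + reply)
    logger.info("语音生成_模型:" + speaker)
    data = {"text": reply, "out": path, 'speaker': speaker}
    return await superVG(data, mode=voicegenerateMode, urls="", langmode=voiceLangType)

# each handler then reduces to: voiceP = await voice_for(r)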

View File

@@ -18,17 +18,7 @@ def main(bot, master, logger, berturl, proxy):
transLateData = yaml.load(file, Loader=yaml.FullLoader)
prob = result.get("prob")
logger.info("读取到apiKey列表")
global models
global characters
try:
from plugins.modelsLoader import modelLoader
models, default, characters = modelLoader() # 读取模型
except:
logger.error("缺少本地vits模型无法使用vits模式(非必要)")
logger.warning(
"如有需要请从https://github.com/avilliai/Manyana/releases/download/Manyana/1374_epochsm.pth下载或在群628763673内获取")
logger.warning("下载后将其放置在vits/voiceModel/nene文件夹下")
logger.warning("然后执行更新脚本的 下载vits依赖 选项")
with open('config/settings.yaml', 'r', encoding='utf-8') as f:
result0 = yaml.load(f.read(), Loader=yaml.FullLoader)
speaker92 = result0.get("语音功能设置").get("speaker")
@@ -36,39 +26,6 @@ def main(bot, master, logger, berturl, proxy):
nudgeornot = result0.get("chatGLM").get("nudgeReply")
logger.info("语音合成模式:" + voicegg + " 语音合成speaker" + speaker92)
if voicegg == "vits":
with open('config/autoSettings.yaml', 'r', encoding='utf-8') as f:
result2 = yaml.load(f.read(), Loader=yaml.FullLoader)
global modelSelect
global speaker
speaker = result2.get("defaultModel").get("speaker")
modelSelect = result2.get("defaultModel").get("modelSelect")
@bot.on(GroupMessage)
async def setDefaultModel(event: GroupMessage):
if event.sender.id == master and str(event.message_chain).startswith("设定角色#"):
global speaker
global modelSelect
if str(event.message_chain).split("#")[1] in characters:
speaker1 = str(event.message_chain).split("#")[1]
logger.info("尝试设定角色:" + speaker1)
speaker = int(characters.get(speaker1)[0])
modelSelect = characters.get(speaker1)[1]
logger.info("设置了语音生成_speaker" + str(speaker))
logger.info("设置了语音生成_模型:" + str(modelSelect))
with open('config/autoSettings.yaml', 'r', encoding='utf-8') as f:
result = yaml.load(f.read(), Loader=yaml.FullLoader)
defaultModel = result.get("defaultModel")
defaultModel["speaker"] = speaker
defaultModel["modelSelect"] = modelSelect
result["defaultModel"] = defaultModel
with open('config/autoSettings.yaml', 'w', encoding="utf-8") as file:
yaml.dump(result, file, allow_unicode=True)
await bot.send(event, "成功设置了vits语音生成默认角色为" + speaker1)
else:
await bot.send(event, "不存在的vits角色")
@bot.on(NudgeEvent)
async def NudgeReply(event: NudgeEvent):
global transLateData

View File

@@ -5,6 +5,8 @@ import yaml
from mirai import GroupMessage, At, Plain,MessageChain
from mirai import Voice
from mirai.models import ForwardMessageNode, Forward
from plugins.modelsLoader import modelLoader
from plugins.toolkits import translate,random_str
from plugins.vitsGenerate import voiceGenerate, superVG, fetch_FishTTS_ModelId, sovits, taffySayTest
@@ -76,36 +78,8 @@ def main(bot, master, logger):
"满穗",
"桑帛"
]
with open('config/autoSettings.yaml', 'r', encoding='utf-8') as f:
result2 = yaml.load(f.read(), Loader=yaml.FullLoader)
global modelSelect
global speaker
speaker = result2.get("defaultModel").get("speaker")
modelSelect = result2.get("defaultModel").get("modelSelect")
global models
global characters
try:
from plugins.modelsLoader import modelLoader
models, default, characters = modelLoader() # 读取模型
except Exception as e:
characters = {"None": "无可用模型"}
@bot.on(GroupMessage)
async def setDefaultModel(event: GroupMessage):
if event.sender.id == master and str(event.message_chain).startswith("设定角色#"):
global speaker
global modelSelect
if str(event.message_chain).split("#")[1] in characters:
speaker1 = str(event.message_chain).split("#")[1]
logger.info("尝试设定角色:" + speaker1)
speaker = int(characters.get(speaker1)[0])
modelSelect = characters.get(speaker1)[1]
logger.info("设置了语音生成_speaker" + str(speaker))
logger.info("设置了语音生成_模型:" + str(modelSelect))
# modelSelect=['voiceModel/selina/selina.pth','voiceModel/selina/config.json']
# print('------\n'+str(CHOISE))
models = modelLoader()
@bot.on(GroupMessage)
async def characterSpeake(event: GroupMessage):
@@ -113,14 +87,13 @@ def main(bot, master, logger):
text = str(event.message_chain)[len(str(event.message_chain).split("说")[0]) + 1:]
speaker = str(event.message_chain).split("说")[0].replace(prefix,"")
if speaker in characters:
text = await translate(text)
for i in models:
if speaker in i:
path = 'data/voices/' + random_str() + '.wav'
logger.info("语音生成_文本" + text)
logger.info("语音生成_模型:" + speaker + str(characters.get(speaker)[1]))
data = {"text": "[JA]" + text + "[JA]", "out": path, 'speaker': characters.get(speaker)[0],
'modelSelect': characters.get(speaker)[1]}
await voiceGenerate(data)
logger.info("语音生成_模型:" + speaker)
data = {"text": text, "out": path, 'speaker': speaker}
await superVG(data,"vits")
await bot.send(event, Voice(path=path))
return
if speaker in modelScope:
@@ -156,19 +129,6 @@ def main(bot, master, logger):
except Exception as e:
logger.error(e)
@bot.on(GroupMessage)
async def characterSpeake(event: GroupMessage):
if "中文" in str(event.message_chain) and str(event.message_chain).split("中文")[0].replace(prefix,"") in characters and str(event.message_chain).startswith(prefix):
speaker = str(event.message_chain).split("中文")[0].replace(prefix,"")
text = str(event.message_chain).split("中文")[1]
path = f'data/voices/{random_str()}.wav'
logger.info("语音生成_文本" + text)
logger.info("语音生成_模型:" + speaker + str(characters.get(speaker)[1]))
data = {"text": "[ZH]" + text + "[ZH]", "out": path, 'speaker': characters.get(speaker)[0],
'modelSelect': characters.get(speaker)[1]}
await voiceGenerate(data)
await bot.send(event, Voice(path=path))
@bot.on(GroupMessage)
async def characterSpeake(event: GroupMessage):
@@ -177,14 +137,15 @@ def main(bot, master, logger):
text = str(event.message_chain)[len(str(event.message_chain).split("日文")[0]) + 1:]
logger.info("语音生成_文本" + text)
if speaker in characters:
path = f'data/voices/{random_str()}.wav'
logger.info("语音生成_模型:" + speaker + str(characters.get(speaker)[1]))
data = {"text": f"[JA]{text}[JA]", "out": path, 'speaker': speaker,
'modelSelect': characters.get(speaker)[1]}
await voiceGenerate(data)
await bot.send(event, Voice(path=path))
for i in models:
if speaker in i:
path = 'data/voices/' + random_str() + '.wav'
logger.info("语音生成_文本" + text)
logger.info("语音生成_模型:" + speaker)
data = {"text": text, "out": path, 'speaker': speaker}
await superVG(data, "vits",urls="",langmode="<jp>")
await bot.send(event, Voice(path=path))
return
try:
sp1 = await fetch_FishTTS_ModelId(proxy, FishTTSAuthorization,speaker)
if sp1 is None or sp1 == "":
@@ -204,8 +165,8 @@
#print(len(str(event.message_chain).replace(str(At(bot.qq)))))
try:
str1 = "vits可用角色如下\n"
for i in characters:
str1 += i + " |"
for i in models:
str1+=i+" |"
except:
str1 = ""
b1=[]

View File

@@ -90,7 +90,7 @@ def get_label(text, label):
else:
return False, text
async def vG(tex,out,speakerID=2,modelSelect=['vits/voiceModel/nene/1374_epochsm.pth','vits/voiceModel/nene/config.json'] ):
def vG(tex,out,speakerID=2,modelSelect=['vits/voiceModel/nene/1374_epochsm.pth','vits/voiceModel/nene/config.json'] ):
if len(tex)>150:
tex='[JA]長すぎるああ、こんなに長い声..... んもう~[JA]'
@@ -166,22 +166,19 @@ async def vG(tex,out,speakerID=2,modelSelect=['vits/voiceModel/nene/1374_epochsm
write(out_path, hps_ms.data.sampling_rate, audio) # write the generated audio to disk
await change_sample_rate(out_path)
async def change_sample_rate(path,new_sample_rate=44100):
#wavfile = path # extract the audio file name, e.g. "1.wav"
# new_file_name = wavfile.split('.')[0] + '_8k.wav' # enable to rename the converted file if needed
signal, sr = librosa.load(path, sr=None) # load the audio with librosa
new_signal = librosa.resample(signal, orig_sr=sr, target_sr=new_sample_rate) # resample with librosa
new_path = path # output path; keeps the original file name
# new_path = os.path.join(new_dir_path, new_file_name) # enable this line if renaming is needed
#print("?")
#print(new_path)
# librosa.output.write_wav(new_path, new_signal , new_sample_rate) # may fail depending on the librosa version
soundfile.write(new_path, new_signal, new_sample_rate)
change_sample_rate(out_path)
def change_sample_rate(path,new_sample_rate=44100):
try:
signal, sr = librosa.load(path, sr=None)
if sr == new_sample_rate:
print("原始采样率与目标采样率相同,无需转换。")
return
new_signal = librosa.resample(signal, orig_sr=sr, target_sr=new_sample_rate)
new_path = path
soundfile.write(new_path, new_signal, new_sample_rate)
print(f"音频文件已保存为: {new_path},采样率为: {new_sample_rate}")
except Exception as e:
print(f"处理音频文件时发生错误: {e}")
def voice_conversion(sourcepath,speaker=0):
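change_sample_rate above is now synchronous and catches its own errors, so vG can call it directly; a brief usage sketch (path illustrative):

# resample a generated clip in place to the 44.1 kHz default
change_sample_rate('data/voices/demo.wav')
# or to an explicit target rate
change_sample_rate('data/voices/demo.wav', new_sample_rate=22050)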

View File

@@ -0,0 +1,143 @@
{
"train": {
"log_interval": 10,
"eval_interval": 100,
"seed": 1234,
"epochs": 10000,
"learning_rate": 0.0002,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 16,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 8192,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"training_files": "final_annotation_train.txt",
"validation_files": "final_annotation_val.txt",
"text_cleaners": [
"zh_ja_mixture_cleaners"
],
"max_wav_value": 32768.0,
"sampling_rate": 22050,
"filter_length": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null,
"add_blank": true,
"n_speakers": 3,
"cleaned_text": true
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
8,
8,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"n_layers_q": 3,
"use_spectral_norm": false,
"gin_channels": 256
},
"speakers": ["薄绿"],
"symbols": [
"_",
",",
".",
"!",
"?",
"-",
"~",
"\u2026",
"A",
"E",
"I",
"N",
"O",
"Q",
"U",
"a",
"b",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"r",
"s",
"t",
"u",
"v",
"w",
"y",
"z",
"\u0283",
"\u02a7",
"\u02a6",
"\u026f",
"\u0279",
"\u0259",
"\u0265",
"\u207c",
"\u02b0",
"`",
"\u2192",
"\u2193",
"\u2191",
" "
]
}
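This is the per-model config.json that modelLoader parses; a minimal sketch of that read path (directory name illustrative; the loader uses yaml.load, which also parses JSON):

import yaml

# modelLoader scans every folder under vits/voiceModel/ for a config.json
with open('vits/voiceModel/example/config.json', 'r', encoding='utf-8') as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)

print(cfg['speakers'])                # ['薄绿']; a speaker's ID is its index in this list
print(cfg['data']['text_cleaners'])   # ['zh_ja_mixture_cleaners'] triggers [ZH]/[JA] wrapping in superVG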