master
宇宙遨游 9 months ago
commit 2b4520a5f1

@@ -0,0 +1,63 @@
import os
import sys
import ffmpeg
import torch
import warnings
# Verify the SDK token
from modelscope.hub.api import HubApi
api = HubApi()
api.login(os.getenv('token'))
# Download the model
from modelscope import snapshot_download
model_dir = snapshot_download('yuzhouaoyou/uvr')
warnings.filterwarnings("ignore")
os.system('pip install librosa==0.9.1')
os.system('pip install numpy==1.23.5')
#os.system('pip install flask')
sys.path.append(os.getcwd())
from infer.modules.uvr5.mdxnet import MDXNetDereverb
from infer.modules.uvr5.vr import AudioPre
def run_uvr(file):
out_path = 'opt'
func = AudioPre(agg=10,model_path=f'{model_dir}/HP5.pth',device='cuda:0' if torch.cuda.is_available() else 'cpu',is_half=False)
need_reformat = True
try:
info = ffmpeg.probe(file, cmd="ffprobe")
if info["streams"][0]["channels"] == 2 and info["streams"][0]["sample_rate"] == "44100":
need_reformat = False
except Exception:
need_reformat = True
if need_reformat:
tmp_path = "%s/%s.reformatted.wav" % (
os.path.join(os.environ["TEMP"]),
os.path.basename(file),
)
os.system(
"ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y"
% (file, tmp_path)
)
file = tmp_path
func._path_audio_(file, out_path, out_path, 'mp3')
return '%s/instrument_%s_10.mp3' % (out_path,os.path.basename(file)), '%s/vocal_%s_10.mp3' % (out_path,os.path.basename(file))
import gradio as gr
def f(file):
i,v = run_uvr(file)
return [i,v]
demo = gr.Interface(
fn=f,
inputs=[gr.File()],
outputs=[gr.Files()],
title="UVR 人声分离",
)
demo.launch()
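# --- Hypothetical local smoke test (not part of the app above) ---
# A minimal sketch of calling the separator directly, without the Gradio UI.
# The input path is a placeholder; the 'token' environment variable must already
# be set before the script starts (see api.login above). run_uvr writes its
# outputs into the 'opt' directory used above.
#
#   instrument_path, vocal_path = run_uvr("/path/to/song.mp3")
#   print(instrument_path)  # e.g. opt/instrument_<name>_10.mp3
#   print(vocal_path)       # e.g. opt/vocal_<name>_10.mp3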

@@ -0,0 +1,73 @@
import os
import traceback
import librosa
import numpy as np
import av
from io import BytesIO
def wav2(i, o, format):
inp = av.open(i, "rb")
if format == "m4a":
format = "mp4"
out = av.open(o, "wb", format=format)
if format == "ogg":
format = "libvorbis"
if format == "mp4":
format = "aac"
ostream = out.add_stream(format)
for frame in inp.decode(audio=0):
for p in ostream.encode(frame):
out.mux(p)
for p in ostream.encode(None):
out.mux(p)
out.close()
inp.close()
def audio2(i, o, format, sr):
inp = av.open(i, "rb")
out = av.open(o, "wb", format=format)
if format == "ogg":
format = "libvorbis"
if format == "f32le":
format = "pcm_f32le"
ostream = out.add_stream(format, channels=1)
ostream.sample_rate = sr
for frame in inp.decode(audio=0):
for p in ostream.encode(frame):
out.mux(p)
out.close()
inp.close()
def load_audio(file, sr):
file = (
file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
)  # guard against users pasting paths with stray spaces, quotes, or newlines
if not os.path.exists(file):
raise RuntimeError(
"You input a wrong audio path that does not exist, please fix it!"
)
try:
with open(file, "rb") as f:
with BytesIO() as out:
audio2(f, out, "f32le", sr)
return np.frombuffer(out.getvalue(), np.float32).flatten()
except AttributeError:
audio = file[1] / 32768.0
if len(audio.shape) == 2:
audio = np.mean(audio, -1)
return librosa.resample(audio, orig_sr=file[0], target_sr=16000)
except Exception:
raise RuntimeError(traceback.format_exc())
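# A minimal usage sketch for load_audio ("test.wav" is a placeholder path and
# must be decodable by PyAV/ffmpeg; 16 kHz is an illustrative target rate).
if __name__ == "__main__":
    pcm = load_audio("test.wav", 16000)  # mono float32 PCM at the requested rate
    print(pcm.dtype, pcm.shape)          # float32 (num_samples,)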

@@ -0,0 +1,459 @@
import copy
import math
from typing import Optional
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from infer.lib.infer_pack import commons, modules
from infer.lib.infer_pack.modules import LayerNorm
class Encoder(nn.Module):
def __init__(
self,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size=1,
p_dropout=0.0,
window_size=10,
**kwargs
):
super(Encoder, self).__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = int(n_layers)
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.window_size = window_size
self.drop = nn.Dropout(p_dropout)
self.attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.attn_layers.append(
MultiHeadAttention(
hidden_channels,
hidden_channels,
n_heads,
p_dropout=p_dropout,
window_size=window_size,
)
)
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(
FFN(
hidden_channels,
hidden_channels,
filter_channels,
kernel_size,
p_dropout=p_dropout,
)
)
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask):
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
zippep = zip(
self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
)
for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep:
y = attn_layers(x, x, attn_mask)
y = self.drop(y)
x = norm_layers_1(x + y)
y = ffn_layers(x, x_mask)
y = self.drop(y)
x = norm_layers_2(x + y)
x = x * x_mask
return x
class Decoder(nn.Module):
def __init__(
self,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size=1,
p_dropout=0.0,
proximal_bias=False,
proximal_init=True,
**kwargs
):
super(Decoder, self).__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.drop = nn.Dropout(p_dropout)
self.self_attn_layers = nn.ModuleList()
self.norm_layers_0 = nn.ModuleList()
self.encdec_attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.self_attn_layers.append(
MultiHeadAttention(
hidden_channels,
hidden_channels,
n_heads,
p_dropout=p_dropout,
proximal_bias=proximal_bias,
proximal_init=proximal_init,
)
)
self.norm_layers_0.append(LayerNorm(hidden_channels))
self.encdec_attn_layers.append(
MultiHeadAttention(
hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
)
)
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(
FFN(
hidden_channels,
hidden_channels,
filter_channels,
kernel_size,
p_dropout=p_dropout,
causal=True,
)
)
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask, h, h_mask):
"""
x: decoder input
h: encoder output
"""
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
device=x.device, dtype=x.dtype
)
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.self_attn_layers[i](x, x, self_attn_mask)
y = self.drop(y)
x = self.norm_layers_0[i](x + y)
y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
class MultiHeadAttention(nn.Module):
def __init__(
self,
channels,
out_channels,
n_heads,
p_dropout=0.0,
window_size=None,
heads_share=True,
block_length=None,
proximal_bias=False,
proximal_init=False,
):
super(MultiHeadAttention, self).__init__()
assert channels % n_heads == 0
self.channels = channels
self.out_channels = out_channels
self.n_heads = n_heads
self.p_dropout = p_dropout
self.window_size = window_size
self.heads_share = heads_share
self.block_length = block_length
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.attn = None
self.k_channels = channels // n_heads
self.conv_q = nn.Conv1d(channels, channels, 1)
self.conv_k = nn.Conv1d(channels, channels, 1)
self.conv_v = nn.Conv1d(channels, channels, 1)
self.conv_o = nn.Conv1d(channels, out_channels, 1)
self.drop = nn.Dropout(p_dropout)
if window_size is not None:
n_heads_rel = 1 if heads_share else n_heads
rel_stddev = self.k_channels**-0.5
self.emb_rel_k = nn.Parameter(
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
* rel_stddev
)
self.emb_rel_v = nn.Parameter(
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
* rel_stddev
)
nn.init.xavier_uniform_(self.conv_q.weight)
nn.init.xavier_uniform_(self.conv_k.weight)
nn.init.xavier_uniform_(self.conv_v.weight)
if proximal_init:
with torch.no_grad():
self.conv_k.weight.copy_(self.conv_q.weight)
self.conv_k.bias.copy_(self.conv_q.bias)
def forward(
self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
):
q = self.conv_q(x)
k = self.conv_k(c)
v = self.conv_v(c)
x, _ = self.attention(q, k, v, mask=attn_mask)
x = self.conv_o(x)
return x
def attention(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: Optional[torch.Tensor] = None,
):
# reshape [b, d, t] -> [b, n_h, t, d_k]
b, d, t_s = key.size()
t_t = query.size(2)
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
if self.window_size is not None:
assert (
t_s == t_t
), "Relative attention is only available for self-attention."
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
rel_logits = self._matmul_with_relative_keys(
query / math.sqrt(self.k_channels), key_relative_embeddings
)
scores_local = self._relative_position_to_absolute_position(rel_logits)
scores = scores + scores_local
if self.proximal_bias:
assert t_s == t_t, "Proximal bias is only available for self-attention."
scores = scores + self._attention_bias_proximal(t_s).to(
device=scores.device, dtype=scores.dtype
)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e4)
if self.block_length is not None:
assert (
t_s == t_t
), "Local attention is only available for self-attention."
block_mask = (
torch.ones_like(scores)
.triu(-self.block_length)
.tril(self.block_length)
)
scores = scores.masked_fill(block_mask == 0, -1e4)
p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
p_attn = self.drop(p_attn)
output = torch.matmul(p_attn, value)
if self.window_size is not None:
relative_weights = self._absolute_position_to_relative_position(p_attn)
value_relative_embeddings = self._get_relative_embeddings(
self.emb_rel_v, t_s
)
output = output + self._matmul_with_relative_values(
relative_weights, value_relative_embeddings
)
output = (
output.transpose(2, 3).contiguous().view(b, d, t_t)
) # [b, n_h, t_t, d_k] -> [b, d, t_t]
return output, p_attn
def _matmul_with_relative_values(self, x, y):
"""
x: [b, h, l, m]
y: [h or 1, m, d]
ret: [b, h, l, d]
"""
ret = torch.matmul(x, y.unsqueeze(0))
return ret
def _matmul_with_relative_keys(self, x, y):
"""
x: [b, h, l, d]
y: [h or 1, m, d]
ret: [b, h, l, m]
"""
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
return ret
def _get_relative_embeddings(self, relative_embeddings, length: int):
max_relative_position = 2 * self.window_size + 1
# Pad first before slice to avoid using cond ops.
pad_length: int = max(length - (self.window_size + 1), 0)
slice_start_position = max((self.window_size + 1) - length, 0)
slice_end_position = slice_start_position + 2 * length - 1
if pad_length > 0:
padded_relative_embeddings = F.pad(
relative_embeddings,
# commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
[0, 0, pad_length, pad_length, 0, 0],
)
else:
padded_relative_embeddings = relative_embeddings
used_relative_embeddings = padded_relative_embeddings[
:, slice_start_position:slice_end_position
]
return used_relative_embeddings
def _relative_position_to_absolute_position(self, x):
"""
x: [b, h, l, 2*l-1]
ret: [b, h, l, l]
"""
batch, heads, length, _ = x.size()
# Concat columns of pad to shift from relative to absolute indexing.
x = F.pad(
x,
# commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
[0, 1, 0, 0, 0, 0, 0, 0],
)
# Concat extra elements so to add up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length])
x_flat = F.pad(
x_flat,
# commons.convert_pad_shape([[0, 0], [0, 0], [0, int(length) - 1]])
[0, int(length) - 1, 0, 0, 0, 0],
)
# Reshape and slice out the padded elements.
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
:, :, :length, length - 1 :
]
return x_final
def _absolute_position_to_relative_position(self, x):
"""
x: [b, h, l, l]
ret: [b, h, l, 2*l-1]
"""
batch, heads, length, _ = x.size()
# pad along column
x = F.pad(
x,
# commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, int(length) - 1]])
[0, int(length) - 1, 0, 0, 0, 0, 0, 0],
)
x_flat = x.view([batch, heads, int(length**2) + int(length * (length - 1))])
# add 0's in the beginning that will skew the elements after reshape
x_flat = F.pad(
x_flat,
# commons.convert_pad_shape([[0, 0], [0, 0], [int(length), 0]])
[length, 0, 0, 0, 0, 0],
)
x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
return x_final
def _attention_bias_proximal(self, length: int):
"""Bias for self-attention to encourage attention to close positions.
Args:
length: an integer scalar.
Returns:
a Tensor with shape [1, 1, length, length]
"""
r = torch.arange(length, dtype=torch.float32)
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
class FFN(nn.Module):
def __init__(
self,
in_channels,
out_channels,
filter_channels,
kernel_size,
p_dropout=0.0,
activation: Optional[str] = None,
causal=False,
):
super(FFN, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.activation = activation
self.causal = causal
self.is_activation = True if activation == "gelu" else False
# if causal:
# self.padding = self._causal_padding
# else:
# self.padding = self._same_padding
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
self.drop = nn.Dropout(p_dropout)
def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
if self.causal:
padding = self._causal_padding(x * x_mask)
else:
padding = self._same_padding(x * x_mask)
return padding
def forward(self, x: torch.Tensor, x_mask: torch.Tensor):
x = self.conv_1(self.padding(x, x_mask))
if self.is_activation:
x = x * torch.sigmoid(1.702 * x)
else:
x = torch.relu(x)
x = self.drop(x)
x = self.conv_2(self.padding(x, x_mask))
return x * x_mask
def _causal_padding(self, x):
if self.kernel_size == 1:
return x
pad_l: int = self.kernel_size - 1
pad_r: int = 0
# padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(
x,
# commons.convert_pad_shape(padding)
[pad_l, pad_r, 0, 0, 0, 0],
)
return x
def _same_padding(self, x):
if self.kernel_size == 1:
return x
pad_l: int = (self.kernel_size - 1) // 2
pad_r: int = self.kernel_size // 2
# padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(
x,
# commons.convert_pad_shape(padding)
[pad_l, pad_r, 0, 0, 0, 0],
)
return x
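# A quick shape check for the Encoder defined above (a sketch; the hyperparameters
# are illustrative, not the project's configuration, and assume this module's
# package imports resolve).
if __name__ == "__main__":
    enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2, n_layers=6)
    dummy = torch.randn(1, 192, 100)                           # [batch, hidden, time]
    mask = commons.sequence_mask(torch.LongTensor([100]), 100)
    mask = mask.unsqueeze(1).to(dummy.dtype)                   # [batch, 1, time]
    print(enc(dummy, mask).shape)                              # torch.Size([1, 192, 100])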

@@ -0,0 +1,172 @@
from typing import List, Optional
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)
# def convert_pad_shape(pad_shape):
# l = pad_shape[::-1]
# pad_shape = [item for sublist in l for item in sublist]
# return pad_shape
def kl_divergence(m_p, logs_p, m_q, logs_q):
"""KL(P||Q)"""
kl = (logs_q - logs_p) - 0.5
kl += (
0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
)
return kl
def rand_gumbel(shape):
"""Sample from the Gumbel distribution, protect from overflows."""
uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
return -torch.log(-torch.log(uniform_samples))
def rand_gumbel_like(x):
g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
return g
def slice_segments(x, ids_str, segment_size=4):
ret = torch.zeros_like(x[:, :, :segment_size])
for i in range(x.size(0)):
idx_str = ids_str[i]
idx_end = idx_str + segment_size
ret[i] = x[i, :, idx_str:idx_end]
return ret
def slice_segments2(x, ids_str, segment_size=4):
ret = torch.zeros_like(x[:, :segment_size])
for i in range(x.size(0)):
idx_str = ids_str[i]
idx_end = idx_str + segment_size
ret[i] = x[i, idx_str:idx_end]
return ret
def rand_slice_segments(x, x_lengths=None, segment_size=4):
b, d, t = x.size()
if x_lengths is None:
x_lengths = t
ids_str_max = x_lengths - segment_size + 1
ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
ret = slice_segments(x, ids_str, segment_size)
return ret, ids_str
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
position = torch.arange(length, dtype=torch.float)
num_timescales = channels // 2
log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
num_timescales - 1
)
inv_timescales = min_timescale * torch.exp(
torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
)
scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
signal = F.pad(signal, [0, 0, 0, channels % 2])
signal = signal.view(1, channels, length)
return signal
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
b, channels, length = x.size()
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
return x + signal.to(dtype=x.dtype, device=x.device)
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
b, channels, length = x.size()
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
def subsequent_mask(length):
mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
return mask
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
n_channels_int = n_channels[0]
in_act = input_a + input_b
t_act = torch.tanh(in_act[:, :n_channels_int, :])
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
acts = t_act * s_act
return acts
# def convert_pad_shape(pad_shape):
# l = pad_shape[::-1]
# pad_shape = [item for sublist in l for item in sublist]
# return pad_shape
def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()
def shift_1d(x):
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
return x
def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
if max_length is None:
max_length = length.max()
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
return x.unsqueeze(0) < length.unsqueeze(1)
def generate_path(duration, mask):
"""
duration: [b, 1, t_x]
mask: [b, 1, t_y, t_x]
"""
device = duration.device
b, _, t_y, t_x = mask.shape
cum_duration = torch.cumsum(duration, -1)
cum_duration_flat = cum_duration.view(b * t_x)
path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
path = path.view(b, t_x, t_y)
path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
path = path.unsqueeze(1).transpose(2, 3) * mask
return path
def clip_grad_value_(parameters, clip_value, norm_type=2):
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = list(filter(lambda p: p.grad is not None, parameters))
norm_type = float(norm_type)
if clip_value is not None:
clip_value = float(clip_value)
total_norm = 0
for p in parameters:
param_norm = p.grad.data.norm(norm_type)
total_norm += param_norm.item() ** norm_type
if clip_value is not None:
p.grad.data.clamp_(min=-clip_value, max=clip_value)
total_norm = total_norm ** (1.0 / norm_type)
return total_norm
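# Quick sanity checks for two of the helpers above (a self-contained sketch).
if __name__ == "__main__":
    # sequence_mask: per-row boolean mask of valid positions up to each length.
    print(sequence_mask(torch.LongTensor([2, 4]), 4))
    # tensor([[ True,  True, False, False],
    #         [ True,  True,  True,  True]])

    # convert_pad_shape: reverses the per-dimension [left, right] pairs and
    # flattens them into the order F.pad expects (last dimension first).
    print(convert_pad_shape([[0, 0], [0, 0], [1, 0]]))  # [1, 0, 0, 0, 0, 0]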

File diff suppressed because it is too large

@@ -0,0 +1,821 @@
import math
import logging
logger = logging.getLogger(__name__)
import numpy as np
import torch
from torch import nn
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn import functional as F
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
from infer.lib.infer_pack import attentions, commons, modules
from infer.lib.infer_pack.commons import get_padding, init_weights
class TextEncoder256(nn.Module):
def __init__(
self,
out_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
f0=True,
):
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb_phone = nn.Linear(256, hidden_channels)
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, phone, pitch, lengths):
if pitch is None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
)
x = self.encoder(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask
class TextEncoder768(nn.Module):
def __init__(
self,
out_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
f0=True,
):
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb_phone = nn.Linear(768, hidden_channels)
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, phone, pitch, lengths):
if pitch is None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
)
x = self.encoder(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask
class ResidualCouplingBlock(nn.Module):
def __init__(
self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
n_flows=4,
gin_channels=0,
):
super().__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.n_flows = n_flows
self.gin_channels = gin_channels
self.flows = nn.ModuleList()
for i in range(n_flows):
self.flows.append(
modules.ResidualCouplingLayer(
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=gin_channels,
mean_only=True,
)
)
self.flows.append(modules.Flip())
def forward(self, x, x_mask, g=None, reverse=False):
if not reverse:
for flow in self.flows:
x, _ = flow(x, x_mask, g=g, reverse=reverse)
else:
for flow in reversed(self.flows):
x, _ = flow(x, x_mask, g=g, reverse=reverse)
return x
def remove_weight_norm(self):
for i in range(self.n_flows):
self.flows[i * 2].remove_weight_norm()
class PosteriorEncoder(nn.Module):
def __init__(
self,
in_channels,
out_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0,
):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
self.enc = modules.WN(
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=gin_channels,
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths, g=None):
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
x.dtype
)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
return z, m, logs, x_mask
def remove_weight_norm(self):
self.enc.remove_weight_norm()
class Generator(torch.nn.Module):
def __init__(
self,
initial_channel,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=0,
):
super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.conv_pre = Conv1d(
initial_channel, upsample_initial_channel, 7, 1, padding=3
)
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
self.ups.append(
weight_norm(
ConvTranspose1d(
upsample_initial_channel // (2**i),
upsample_initial_channel // (2 ** (i + 1)),
k,
u,
padding=(k - u) // 2,
)
)
)
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel // (2 ** (i + 1))
for j, (k, d) in enumerate(
zip(resblock_kernel_sizes, resblock_dilation_sizes)
):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
self.ups.apply(init_weights)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
def forward(self, x, g=None):
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
for i in range(self.num_upsamples):
x = F.leaky_relu(x, modules.LRELU_SLOPE)
x = self.ups[i](x)
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
class SineGen(torch.nn.Module):
"""Definition of sine generator
SineGen(samp_rate, harmonic_num = 0,
sine_amp = 0.1, noise_std = 0.003,
voiced_threshold = 0,
flag_for_pulse=False)
samp_rate: sampling rate in Hz
harmonic_num: number of harmonic overtones (default 0)
sine_amp: amplitude of sine waveform (default 0.1)
noise_std: std of Gaussian noise (default 0.003)
voiced_threshold: F0 threshold for U/V classification (default 0)
flag_for_pulse: this SineGen is used inside PulseGen (default False)
Note: when flag_for_pulse is True, the first time step of a voiced
segment is always sin(np.pi) or cos(0)
"""
def __init__(
self,
samp_rate,
harmonic_num=0,
sine_amp=0.1,
noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False,
):
super(SineGen, self).__init__()
self.sine_amp = sine_amp
self.noise_std = noise_std
self.harmonic_num = harmonic_num
self.dim = self.harmonic_num + 1
self.sampling_rate = samp_rate
self.voiced_threshold = voiced_threshold
def _f02uv(self, f0):
# generate uv signal
uv = torch.ones_like(f0)
uv = uv * (f0 > self.voiced_threshold)
return uv
def forward(self, f0, upp):
"""sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0
output sine_tensor: tensor(batchsize=1, length, dim)
output uv: tensor(batchsize=1, length, 1)
"""
with torch.no_grad():
f0 = f0[:, None].transpose(1, 2)
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
# fundamental component
f0_buf[:, :, 0] = f0[:, :, 0]
for idx in np.arange(self.harmonic_num):
f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
idx + 2
) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har products can no longer be optimized away in post-processing
rand_ini = torch.rand(
f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
)
rand_ini[:, 0] = 0
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # taking % 1 here would keep the later cumsum from being optimized
tmp_over_one *= upp
tmp_over_one = F.interpolate(
tmp_over_one.transpose(2, 1),
scale_factor=upp,
mode="linear",
align_corners=True,
).transpose(2, 1)
rad_values = F.interpolate(
rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
).transpose(
2, 1
) #######
tmp_over_one %= 1
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
sine_waves = torch.sin(
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
)
sine_waves = sine_waves * self.sine_amp
uv = self._f02uv(f0)
uv = F.interpolate(
uv.transpose(2, 1), scale_factor=upp, mode="nearest"
).transpose(2, 1)
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves)
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
"""SourceModule for hn-nsf
SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0)
sampling_rate: sampling_rate in Hz
harmonic_num: number of harmonic above F0 (default: 0)
sine_amp: amplitude of sine source signal (default: 0.1)
add_noise_std: std of additive Gaussian noise (default: 0.003)
note that amplitude of noise in unvoiced is decided
by sine_amp
voiced_threshold: threshold to set U/V given F0 (default: 0)
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
F0_sampled (batchsize, length, 1)
Sine_source (batchsize, length, 1)
noise_source (batchsize, length, 1)
uv (batchsize, length, 1)
"""
def __init__(
self,
sampling_rate,
harmonic_num=0,
sine_amp=0.1,
add_noise_std=0.003,
voiced_threshod=0,
is_half=True,
):
super(SourceModuleHnNSF, self).__init__()
self.sine_amp = sine_amp
self.noise_std = add_noise_std
self.is_half = is_half
# to produce sine waveforms
self.l_sin_gen = SineGen(
sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
)
# to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh()
def forward(self, x, upp=None):
sine_wavs, uv, _ = self.l_sin_gen(x, upp)
if self.is_half:
sine_wavs = sine_wavs.half()
sine_merge = self.l_tanh(self.l_linear(sine_wavs))
return sine_merge, None, None # noise, uv
class GeneratorNSF(torch.nn.Module):
def __init__(
self,
initial_channel,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels,
sr,
is_half=False,
):
super(GeneratorNSF, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
self.m_source = SourceModuleHnNSF(
sampling_rate=sr, harmonic_num=0, is_half=is_half
)
self.noise_convs = nn.ModuleList()
self.conv_pre = Conv1d(
initial_channel, upsample_initial_channel, 7, 1, padding=3
)
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
c_cur = upsample_initial_channel // (2 ** (i + 1))
self.ups.append(
weight_norm(
ConvTranspose1d(
upsample_initial_channel // (2**i),
upsample_initial_channel // (2 ** (i + 1)),
k,
u,
padding=(k - u) // 2,
)
)
)
if i + 1 < len(upsample_rates):
stride_f0 = np.prod(upsample_rates[i + 1 :])
self.noise_convs.append(
Conv1d(
1,
c_cur,
kernel_size=stride_f0 * 2,
stride=stride_f0,
padding=stride_f0 // 2,
)
)
else:
self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel // (2 ** (i + 1))
for j, (k, d) in enumerate(
zip(resblock_kernel_sizes, resblock_dilation_sizes)
):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
self.ups.apply(init_weights)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
self.upp = np.prod(upsample_rates)
def forward(self, x, f0, g=None):
har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2)
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
for i in range(self.num_upsamples):
x = F.leaky_relu(x, modules.LRELU_SLOPE)
x = self.ups[i](x)
x_source = self.noise_convs[i](har_source)
x = x + x_source
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
sr2sr = {
"32k": 32000,
"40k": 40000,
"48k": 48000,
}
class SynthesizerTrnMsNSFsidM(nn.Module):
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
spk_embed_dim,
gin_channels,
sr,
version,
**kwargs,
):
super().__init__()
if isinstance(sr, str):
sr = sr2sr[sr]
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.gin_channels = gin_channels
# self.hop_length = hop_length#
self.spk_embed_dim = spk_embed_dim
if version == "v1":
self.enc_p = TextEncoder256(
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
)
else:
self.enc_p = TextEncoder768(
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
)
self.dec = GeneratorNSF(
inter_channels,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=gin_channels,
sr=sr,
is_half=kwargs["is_half"],
)
self.enc_q = PosteriorEncoder(
spec_channels,
inter_channels,
hidden_channels,
5,
1,
16,
gin_channels=gin_channels,
)
self.flow = ResidualCouplingBlock(
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
self.speaker_map = None
logger.debug(
f"gin_channels: {gin_channels}, self.spk_embed_dim: {self.spk_embed_dim}"
)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def construct_spkmixmap(self, n_speaker):
self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
for i in range(n_speaker):
self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
self.speaker_map = self.speaker_map.unsqueeze(0)
def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
if self.speaker_map is not None: # [N, S] * [S, B, 1, H]
g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
g = g * self.speaker_map # [N, S, B, 1, H]
g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
else:
g = g.unsqueeze(0)
g = self.emb_g(g).transpose(1, 2)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
return o
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(MultiPeriodDiscriminator, self).__init__()
periods = [2, 3, 5, 7, 11, 17]
# periods = [3, 5, 7, 11, 17, 23, 37]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
discs = discs + [
DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
]
self.discriminators = nn.ModuleList(discs)
def forward(self, y, y_hat):
y_d_rs = [] #
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
# for j in range(len(fmap_r)):
# print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
y_d_rs.append(y_d_r)
y_d_gs.append(y_d_g)
fmap_rs.append(fmap_r)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class MultiPeriodDiscriminatorV2(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(MultiPeriodDiscriminatorV2, self).__init__()
# periods = [2, 3, 5, 7, 11, 17]
periods = [2, 3, 5, 7, 11, 17, 23, 37]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
discs = discs + [
DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
]
self.discriminators = nn.ModuleList(discs)
def forward(self, y, y_hat):
y_d_rs = [] #
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
# for j in range(len(fmap_r)):
# print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
y_d_rs.append(y_d_r)
y_d_gs.append(y_d_g)
fmap_rs.append(fmap_r)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = spectral_norm if use_spectral_norm else weight_norm
self.convs = nn.ModuleList(
[
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
]
)
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
def forward(self, x):
fmap = []
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, modules.LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
self.use_spectral_norm = use_spectral_norm
norm_f = spectral_norm if use_spectral_norm else weight_norm
self.convs = nn.ModuleList(
[
norm_f(
Conv2d(
1,
32,
(kernel_size, 1),
(stride, 1),
padding=(get_padding(kernel_size, 1), 0),
)
),
norm_f(
Conv2d(
32,
128,
(kernel_size, 1),
(stride, 1),
padding=(get_padding(kernel_size, 1), 0),
)
),
norm_f(
Conv2d(
128,
512,
(kernel_size, 1),
(stride, 1),
padding=(get_padding(kernel_size, 1), 0),
)
),
norm_f(
Conv2d(
512,
1024,
(kernel_size, 1),
(stride, 1),
padding=(get_padding(kernel_size, 1), 0),
)
),
norm_f(
Conv2d(
1024,
1024,
(kernel_size, 1),
1,
padding=(get_padding(kernel_size, 1), 0),
)
),
]
)
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
def forward(self, x):
fmap = []
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, modules.LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
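# A minimal sketch of driving SineGen on its own (illustrative values: a flat
# 220 Hz contour at 40 kHz output with upp = 400 samples per frame; assumes the
# package imports at the top of this file resolve).
if __name__ == "__main__":
    gen = SineGen(samp_rate=40000, harmonic_num=0)
    f0 = torch.full((1, 100), 220.0)   # [batch, frames]
    sine, uv, _ = gen(f0, upp=400)     # 100 frames * 400 = 40000 samples
    print(sine.shape, uv.shape)        # both torch.Size([1, 40000, 1])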

@@ -0,0 +1,615 @@
import copy
import math
from typing import Optional, Tuple
import numpy as np
import scipy
import torch
from torch import nn
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn import functional as F
from torch.nn.utils import remove_weight_norm, weight_norm
from infer.lib.infer_pack import commons
from infer.lib.infer_pack.commons import get_padding, init_weights
from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform
LRELU_SLOPE = 0.1
class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5):
super(LayerNorm, self).__init__()
self.channels = channels
self.eps = eps
self.gamma = nn.Parameter(torch.ones(channels))
self.beta = nn.Parameter(torch.zeros(channels))
def forward(self, x):
x = x.transpose(1, -1)
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
return x.transpose(1, -1)
class ConvReluNorm(nn.Module):
def __init__(
self,
in_channels,
hidden_channels,
out_channels,
kernel_size,
n_layers,
p_dropout,
):
super(ConvReluNorm, self).__init__()
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.p_dropout = float(p_dropout)
assert n_layers > 1, "Number of layers should be larger than 1."
self.conv_layers = nn.ModuleList()
self.norm_layers = nn.ModuleList()
self.conv_layers.append(
nn.Conv1d(
in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
)
)
self.norm_layers.append(LayerNorm(hidden_channels))
self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(float(p_dropout)))
for _ in range(n_layers - 1):
self.conv_layers.append(
nn.Conv1d(
hidden_channels,
hidden_channels,
kernel_size,
padding=kernel_size // 2,
)
)
self.norm_layers.append(LayerNorm(hidden_channels))
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(self, x, x_mask):
x_org = x
for i in range(self.n_layers):
x = self.conv_layers[i](x * x_mask)
x = self.norm_layers[i](x)
x = self.relu_drop(x)
x = x_org + self.proj(x)
return x * x_mask
class DDSConv(nn.Module):
"""
Dilated and Depth-Separable Convolution
"""
def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
super(DDSConv, self).__init__()
self.channels = channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.p_dropout = float(p_dropout)
self.drop = nn.Dropout(float(p_dropout))
self.convs_sep = nn.ModuleList()
self.convs_1x1 = nn.ModuleList()
self.norms_1 = nn.ModuleList()
self.norms_2 = nn.ModuleList()
for i in range(n_layers):
dilation = kernel_size**i
padding = (kernel_size * dilation - dilation) // 2
self.convs_sep.append(
nn.Conv1d(
channels,
channels,
kernel_size,
groups=channels,
dilation=dilation,
padding=padding,
)
)
self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
self.norms_1.append(LayerNorm(channels))
self.norms_2.append(LayerNorm(channels))
def forward(self, x, x_mask, g: Optional[torch.Tensor] = None):
if g is not None:
x = x + g
for i in range(self.n_layers):
y = self.convs_sep[i](x * x_mask)
y = self.norms_1[i](y)
y = F.gelu(y)
y = self.convs_1x1[i](y)
y = self.norms_2[i](y)
y = F.gelu(y)
y = self.drop(y)
x = x + y
return x * x_mask
class WN(torch.nn.Module):
def __init__(
self,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0,
p_dropout=0,
):
super(WN, self).__init__()
assert kernel_size % 2 == 1
self.hidden_channels = hidden_channels
self.kernel_size = (kernel_size,)
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.p_dropout = float(p_dropout)
self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList()
self.drop = nn.Dropout(float(p_dropout))
if gin_channels != 0:
cond_layer = torch.nn.Conv1d(
gin_channels, 2 * hidden_channels * n_layers, 1
)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
for i in range(n_layers):
dilation = dilation_rate**i
padding = int((kernel_size * dilation - dilation) / 2)
in_layer = torch.nn.Conv1d(
hidden_channels,
2 * hidden_channels,
kernel_size,
dilation=dilation,
padding=padding,
)
in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
self.in_layers.append(in_layer)
# last one is not necessary
if i < n_layers - 1:
res_skip_channels = 2 * hidden_channels
else:
res_skip_channels = hidden_channels
res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
self.res_skip_layers.append(res_skip_layer)
def forward(
self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None
):
output = torch.zeros_like(x)
n_channels_tensor = torch.IntTensor([self.hidden_channels])
if g is not None:
g = self.cond_layer(g)
for i, (in_layer, res_skip_layer) in enumerate(
zip(self.in_layers, self.res_skip_layers)
):
x_in = in_layer(x)
if g is not None:
cond_offset = i * 2 * self.hidden_channels
g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
else:
g_l = torch.zeros_like(x_in)
acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
acts = self.drop(acts)
res_skip_acts = res_skip_layer(acts)
if i < self.n_layers - 1:
res_acts = res_skip_acts[:, : self.hidden_channels, :]
x = (x + res_acts) * x_mask
output = output + res_skip_acts[:, self.hidden_channels :, :]
else:
output = output + res_skip_acts
return output * x_mask
def remove_weight_norm(self):
if self.gin_channels != 0:
torch.nn.utils.remove_weight_norm(self.cond_layer)
for l in self.in_layers:
torch.nn.utils.remove_weight_norm(l)
for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l)
def __prepare_scriptable__(self):
if self.gin_channels != 0:
for hook in self.cond_layer._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.cond_layer)
for l in self.in_layers:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
for l in self.res_skip_layers:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
return self
class ResBlock1(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
super(ResBlock1, self).__init__()
self.convs1 = nn.ModuleList(
[
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[2],
padding=get_padding(kernel_size, dilation[2]),
)
),
]
)
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList(
[
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
),
]
)
self.convs2.apply(init_weights)
self.lrelu_slope = LRELU_SLOPE
def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, self.lrelu_slope)
if x_mask is not None:
xt = xt * x_mask
xt = c1(xt)
xt = F.leaky_relu(xt, self.lrelu_slope)
if x_mask is not None:
xt = xt * x_mask
xt = c2(xt)
x = xt + x
if x_mask is not None:
x = x * x_mask
return x
def remove_weight_norm(self):
for l in self.convs1:
remove_weight_norm(l)
for l in self.convs2:
remove_weight_norm(l)
def __prepare_scriptable__(self):
for l in self.convs1:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
for l in self.convs2:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
return self
class ResBlock2(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
super(ResBlock2, self).__init__()
self.convs = nn.ModuleList(
[
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]),
)
),
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]),
)
),
]
)
self.convs.apply(init_weights)
self.lrelu_slope = LRELU_SLOPE
def forward(self, x, x_mask: Optional[torch.Tensor] = None):
for c in self.convs:
xt = F.leaky_relu(x, self.lrelu_slope)
if x_mask is not None:
xt = xt * x_mask
xt = c(xt)
x = xt + x
if x_mask is not None:
x = x * x_mask
return x
def remove_weight_norm(self):
for l in self.convs:
remove_weight_norm(l)
def __prepare_scriptable__(self):
for l in self.convs:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
return self
class Log(nn.Module):
def forward(
self,
x: torch.Tensor,
x_mask: torch.Tensor,
g: Optional[torch.Tensor] = None,
reverse: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
if not reverse:
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
logdet = torch.sum(-y, [1, 2])
return y, logdet
else:
x = torch.exp(x) * x_mask
return x
class Flip(nn.Module):
# torch.jit.script() Compiled functions \
# can't take variable number of arguments or \
# use keyword-only arguments with defaults
def forward(
self,
x: torch.Tensor,
x_mask: torch.Tensor,
g: Optional[torch.Tensor] = None,
reverse: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
x = torch.flip(x, [1])
if not reverse:
logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
return x, logdet
else:
return x, torch.zeros([1], device=x.device)
class ElementwiseAffine(nn.Module):
def __init__(self, channels):
super(ElementwiseAffine, self).__init__()
self.channels = channels
self.m = nn.Parameter(torch.zeros(channels, 1))
self.logs = nn.Parameter(torch.zeros(channels, 1))
def forward(self, x, x_mask, reverse=False, **kwargs):
if not reverse:
y = self.m + torch.exp(self.logs) * x
y = y * x_mask
logdet = torch.sum(self.logs * x_mask, [1, 2])
return y, logdet
else:
x = (x - self.m) * torch.exp(-self.logs) * x_mask
return x
class ResidualCouplingLayer(nn.Module):
def __init__(
self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
p_dropout=0,
gin_channels=0,
mean_only=False,
):
assert channels % 2 == 0, "channels should be divisible by 2"
super(ResidualCouplingLayer, self).__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.half_channels = channels // 2
self.mean_only = mean_only
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
self.enc = WN(
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
p_dropout=float(p_dropout),
gin_channels=gin_channels,
)
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
self.post.weight.data.zero_()
self.post.bias.data.zero_()
def forward(
self,
x: torch.Tensor,
x_mask: torch.Tensor,
g: Optional[torch.Tensor] = None,
reverse: bool = False,
):
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0) * x_mask
h = self.enc(h, x_mask, g=g)
stats = self.post(h) * x_mask
if not self.mean_only:
m, logs = torch.split(stats, [self.half_channels] * 2, 1)
else:
m = stats
logs = torch.zeros_like(m)
if not reverse:
x1 = m + x1 * torch.exp(logs) * x_mask
x = torch.cat([x0, x1], 1)
logdet = torch.sum(logs, [1, 2])
return x, logdet
else:
x1 = (x1 - m) * torch.exp(-logs) * x_mask
x = torch.cat([x0, x1], 1)
return x, torch.zeros([1])
def remove_weight_norm(self):
self.enc.remove_weight_norm()
def __prepare_scriptable__(self):
for hook in self.enc._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(self.enc)
return self
class ConvFlow(nn.Module):
def __init__(
self,
in_channels,
filter_channels,
kernel_size,
n_layers,
num_bins=10,
tail_bound=5.0,
):
super(ConvFlow, self).__init__()
self.in_channels = in_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.num_bins = num_bins
self.tail_bound = tail_bound
self.half_channels = in_channels // 2
self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
self.proj = nn.Conv1d(
filter_channels, self.half_channels * (num_bins * 3 - 1), 1
)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(
self,
x: torch.Tensor,
x_mask: torch.Tensor,
g: Optional[torch.Tensor] = None,
reverse=False,
):
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0)
h = self.convs(h, x_mask, g=g)
h = self.proj(h) * x_mask
b, c, t = x0.shape
h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
self.filter_channels
)
unnormalized_derivatives = h[..., 2 * self.num_bins :]
x1, logabsdet = piecewise_rational_quadratic_transform(
x1,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=reverse,
tails="linear",
tail_bound=self.tail_bound,
)
x = torch.cat([x0, x1], 1) * x_mask
logdet = torch.sum(logabsdet * x_mask, [1, 2])
if not reverse:
return x, logdet
else:
return x
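# An invertibility check for ResidualCouplingLayer above (a sketch with toy sizes;
# p_dropout is 0, so forward followed by reverse recovers the input up to float
# error; assumes the package imports at the top of this file resolve).
if __name__ == "__main__":
    layer = ResidualCouplingLayer(
        channels=4, hidden_channels=8, kernel_size=5,
        dilation_rate=1, n_layers=2, mean_only=True,
    )
    x = torch.randn(1, 4, 10)
    mask = torch.ones(1, 1, 10)
    y, _ = layer(x, mask, reverse=False)
    x_back, _ = layer(y, mask, reverse=True)
    print(torch.allclose(x, x_back, atol=1e-5))  # True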

@@ -0,0 +1,91 @@
import numpy as np
import pyworld
from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
class DioF0Predictor(F0Predictor):
def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
self.hop_length = hop_length
self.f0_min = f0_min
self.f0_max = f0_max
self.sampling_rate = sampling_rate
def interpolate_f0(self, f0):
"""
Interpolate the F0 contour over unvoiced frames
"""
data = np.reshape(f0, (f0.size, 1))
vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
vuv_vector[data > 0.0] = 1.0
vuv_vector[data <= 0.0] = 0.0
ip_data = data
frame_number = data.size
last_value = 0.0
for i in range(frame_number):
if data[i] <= 0.0:
j = i + 1
for j in range(i + 1, frame_number):
if data[j] > 0.0:
break
if j < frame_number - 1:
if last_value > 0.0:
step = (data[j] - data[i - 1]) / float(j - i)
for k in range(i, j):
ip_data[k] = data[i - 1] + step * (k - i + 1)
else:
for k in range(i, j):
ip_data[k] = data[j]
else:
for k in range(i, frame_number):
ip_data[k] = last_value
else:
ip_data[i] = data[i]  # there may be an unnecessary copy here
last_value = data[i]
return ip_data[:, 0], vuv_vector[:, 0]
def resize_f0(self, x, target_len):
source = np.array(x)
source[source < 0.001] = np.nan
target = np.interp(
np.arange(0, len(source) * target_len, len(source)) / target_len,
np.arange(0, len(source)),
source,
)
res = np.nan_to_num(target)
return res
def compute_f0(self, wav, p_len=None):
if p_len is None:
p_len = wav.shape[0] // self.hop_length
f0, t = pyworld.dio(
wav.astype(np.double),
fs=self.sampling_rate,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
frame_period=1000 * self.hop_length / self.sampling_rate,
)
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
for index, pitch in enumerate(f0):
f0[index] = round(pitch, 1)
return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
def compute_f0_uv(self, wav, p_len=None):
if p_len is None:
p_len = wav.shape[0] // self.hop_length
f0, t = pyworld.dio(
wav.astype(np.double),
fs=self.sampling_rate,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
frame_period=1000 * self.hop_length / self.sampling_rate,
)
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
for index, pitch in enumerate(f0):
f0[index] = round(pitch, 1)
return self.interpolate_f0(self.resize_f0(f0, p_len))
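# Hypothetical usage: estimate the F0 of a one-second 220 Hz test tone (the tone,
# sample rate, and hop length are illustrative; requires pyworld, imported above).
if __name__ == "__main__":
    sr = 16000
    t = np.arange(sr) / sr
    wav = 0.5 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)
    predictor = DioF0Predictor(hop_length=160, sampling_rate=sr)
    f0 = predictor.compute_f0(wav)
    print(f0.shape, float(np.median(f0[f0 > 0])))  # (100,), roughly 220.0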

@@ -0,0 +1,16 @@
class F0Predictor(object):
def compute_f0(self, wav, p_len):
"""
input: wav:[signal_length]
p_len:int
output: f0:[signal_length//hop_length]
"""
pass
def compute_f0_uv(self, wav, p_len):
"""
input: wav:[signal_length]
p_len:int
output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
"""
pass

@@ -0,0 +1,87 @@
import numpy as np
import pyworld
from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
class HarvestF0Predictor(F0Predictor):
def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
self.hop_length = hop_length
self.f0_min = f0_min
self.f0_max = f0_max
self.sampling_rate = sampling_rate
def interpolate_f0(self, f0):
"""
Interpolate the F0 contour over unvoiced frames
"""
data = np.reshape(f0, (f0.size, 1))
vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
vuv_vector[data > 0.0] = 1.0
vuv_vector[data <= 0.0] = 0.0
ip_data = data
frame_number = data.size
last_value = 0.0
for i in range(frame_number):
if data[i] <= 0.0:
j = i + 1
for j in range(i + 1, frame_number):
if data[j] > 0.0:
break
if j < frame_number - 1:
if last_value > 0.0:
step = (data[j] - data[i - 1]) / float(j - i)
for k in range(i, j):
ip_data[k] = data[i - 1] + step * (k - i + 1)
else:
for k in range(i, j):
ip_data[k] = data[j]
else:
for k in range(i, frame_number):
ip_data[k] = last_value
else:
ip_data[i] = data[i]  # this copy may be unnecessary
last_value = data[i]
return ip_data[:, 0], vuv_vector[:, 0]
def resize_f0(self, x, target_len):
source = np.array(x)
source[source < 0.001] = np.nan
target = np.interp(
np.arange(0, len(source) * target_len, len(source)) / target_len,
np.arange(0, len(source)),
source,
)
res = np.nan_to_num(target)
return res
def compute_f0(self, wav, p_len=None):
if p_len is None:
p_len = wav.shape[0] // self.hop_length
f0, t = pyworld.harvest(
wav.astype(np.double),
fs=self.sampling_rate,
f0_ceil=self.f0_max,
f0_floor=self.f0_min,
frame_period=1000 * self.hop_length / self.sampling_rate,
)
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)  # self.fs was never defined; use the configured sampling rate
return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
def compute_f0_uv(self, wav, p_len=None):
if p_len is None:
p_len = wav.shape[0] // self.hop_length
f0, t = pyworld.harvest(
wav.astype(np.double),
fs=self.sampling_rate,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
frame_period=1000 * self.hop_length / self.sampling_rate,
)
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
return self.interpolate_f0(self.resize_f0(f0, p_len))

@ -0,0 +1,98 @@
import numpy as np
import parselmouth
from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
class PMF0Predictor(F0Predictor):
def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
self.hop_length = hop_length
self.f0_min = f0_min
self.f0_max = f0_max
self.sampling_rate = sampling_rate
def interpolate_f0(self, f0):
"""
Interpolate F0 over unvoiced (zero) frames and return the voiced/unvoiced mask.
"""
data = np.reshape(f0, (f0.size, 1))
vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
vuv_vector[data > 0.0] = 1.0
vuv_vector[data <= 0.0] = 0.0
ip_data = data
frame_number = data.size
last_value = 0.0
for i in range(frame_number):
if data[i] <= 0.0:
j = i + 1
for j in range(i + 1, frame_number):
if data[j] > 0.0:
break
if j < frame_number - 1:
if last_value > 0.0:
step = (data[j] - data[i - 1]) / float(j - i)
for k in range(i, j):
ip_data[k] = data[i - 1] + step * (k - i + 1)
else:
for k in range(i, j):
ip_data[k] = data[j]
else:
for k in range(i, frame_number):
ip_data[k] = last_value
else:
ip_data[i] = data[i]  # this copy may be unnecessary
last_value = data[i]
return ip_data[:, 0], vuv_vector[:, 0]
def compute_f0(self, wav, p_len=None):
x = wav
if p_len is None:
p_len = x.shape[0] // self.hop_length
else:
assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
time_step = self.hop_length / self.sampling_rate * 1000
f0 = (
parselmouth.Sound(x, self.sampling_rate)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=self.f0_min,
pitch_ceiling=self.f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
f0, uv = self.interpolate_f0(f0)
return f0
def compute_f0_uv(self, wav, p_len=None):
x = wav
if p_len is None:
p_len = x.shape[0] // self.hop_length
else:
assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
time_step = self.hop_length / self.sampling_rate * 1000
f0 = (
parselmouth.Sound(x, self.sampling_rate)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=self.f0_min,
pitch_ceiling=self.f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
f0, uv = self.interpolate_f0(f0)
return f0, uv

@ -0,0 +1,149 @@
import librosa
import numpy as np
import onnxruntime
import soundfile
import logging
logger = logging.getLogger(__name__)
class ContentVec:
def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
logger.info("Load model(s) from {}".format(vec_path))
if device == "cpu" or device is None:
providers = ["CPUExecutionProvider"]
elif device == "cuda":
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
elif device == "dml":
providers = ["DmlExecutionProvider"]
else:
raise RuntimeError("Unsportted Device")
self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
def __call__(self, wav):
return self.forward(wav)
def forward(self, wav):
feats = wav
if feats.ndim == 2: # double channels
feats = feats.mean(-1)
assert feats.ndim == 1, feats.ndim
feats = np.expand_dims(np.expand_dims(feats, 0), 0)
onnx_input = {self.model.get_inputs()[0].name: feats}
logits = self.model.run(None, onnx_input)[0]
return logits.transpose(0, 2, 1)
def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
if f0_predictor == "pm":
from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
f0_predictor_object = PMF0Predictor(
hop_length=hop_length, sampling_rate=sampling_rate
)
elif f0_predictor == "harvest":
from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import (
HarvestF0Predictor,
)
f0_predictor_object = HarvestF0Predictor(
hop_length=hop_length, sampling_rate=sampling_rate
)
elif f0_predictor == "dio":
from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
f0_predictor_object = DioF0Predictor(
hop_length=hop_length, sampling_rate=sampling_rate
)
else:
raise Exception("Unknown f0 predictor")
return f0_predictor_object
class OnnxRVC:
def __init__(
self,
model_path,
sr=40000,
hop_size=512,
vec_path="vec-768-layer-12",
device="cpu",
):
vec_path = f"pretrained/{vec_path}.onnx"
self.vec_model = ContentVec(vec_path, device)
if device == "cpu" or device is None:
providers = ["CPUExecutionProvider"]
elif device == "cuda":
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
elif device == "dml":
providers = ["DmlExecutionProvider"]
else:
raise RuntimeError("Unsportted Device")
self.model = onnxruntime.InferenceSession(model_path, providers=providers)
self.sampling_rate = sr
self.hop_size = hop_size
def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
onnx_input = {
self.model.get_inputs()[0].name: hubert,
self.model.get_inputs()[1].name: hubert_length,
self.model.get_inputs()[2].name: pitch,
self.model.get_inputs()[3].name: pitchf,
self.model.get_inputs()[4].name: ds,
self.model.get_inputs()[5].name: rnd,
}
return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)
def inference(
self,
raw_path,
sid,
f0_method="dio",
f0_up_key=0,
pad_time=0.5,
cr_threshold=0.02,
):
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0_predictor = get_f0_predictor(
f0_method,
hop_length=self.hop_size,
sampling_rate=self.sampling_rate,
threshold=cr_threshold,
)
wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
org_length = len(wav)
if org_length / sr > 50.0:
raise RuntimeError("Reached Max Length")
wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)
hubert = self.vec_model(wav16k)
hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
hubert_length = hubert.shape[1]
pitchf = f0_predictor.compute_f0(wav, hubert_length)
pitchf = pitchf * 2 ** (f0_up_key / 12)
pitch = pitchf.copy()
f0_mel = 1127 * np.log(1 + pitch / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
pitch = np.rint(f0_mel).astype(np.int64)
pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
pitch = pitch.reshape(1, len(pitch))
ds = np.array([sid]).astype(np.int64)
rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
hubert_length = np.array([hubert_length]).astype(np.int64)
out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
return out_wav[0:org_length]
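# Minimal usage sketch: run an ONNX-exported RVC model on one file. The model
# path, input file and speaker id are hypothetical; soundfile is imported above.
if __name__ == "__main__":
    model = OnnxRVC(
        "onnx_export/model.onnx",  # hypothetical exported synthesizer
        sr=40000,
        hop_size=512,
        vec_path="vec-768-layer-12",  # resolved to pretrained/vec-768-layer-12.onnx
        device="cpu",
    )
    out = model.inference("input.wav", sid=0, f0_method="dio", f0_up_key=0)
    soundfile.write("converted.wav", out, 40000)  # int16 samples at the model sample rate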

@ -0,0 +1,207 @@
import numpy as np
import torch
from torch.nn import functional as F
DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
def piecewise_rational_quadratic_transform(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails=None,
tail_bound=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
if tails is None:
spline_fn = rational_quadratic_spline
spline_kwargs = {}
else:
spline_fn = unconstrained_rational_quadratic_spline
spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
outputs, logabsdet = spline_fn(
inputs=inputs,
unnormalized_widths=unnormalized_widths,
unnormalized_heights=unnormalized_heights,
unnormalized_derivatives=unnormalized_derivatives,
inverse=inverse,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
**spline_kwargs
)
return outputs, logabsdet
def searchsorted(bin_locations, inputs, eps=1e-6):
bin_locations[..., -1] += eps
return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
def unconstrained_rational_quadratic_spline(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails="linear",
tail_bound=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
outside_interval_mask = ~inside_interval_mask
outputs = torch.zeros_like(inputs)
logabsdet = torch.zeros_like(inputs)
if tails == "linear":
unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
constant = np.log(np.exp(1 - min_derivative) - 1)
unnormalized_derivatives[..., 0] = constant
unnormalized_derivatives[..., -1] = constant
outputs[outside_interval_mask] = inputs[outside_interval_mask]
logabsdet[outside_interval_mask] = 0
else:
raise RuntimeError("{} tails are not implemented.".format(tails))
(
outputs[inside_interval_mask],
logabsdet[inside_interval_mask],
) = rational_quadratic_spline(
inputs=inputs[inside_interval_mask],
unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
inverse=inverse,
left=-tail_bound,
right=tail_bound,
bottom=-tail_bound,
top=tail_bound,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
)
return outputs, logabsdet
def rational_quadratic_spline(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
left=0.0,
right=1.0,
bottom=0.0,
top=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
if torch.min(inputs) < left or torch.max(inputs) > right:
raise ValueError("Input to a transform is not within its domain")
num_bins = unnormalized_widths.shape[-1]
if min_bin_width * num_bins > 1.0:
raise ValueError("Minimal bin width too large for the number of bins")
if min_bin_height * num_bins > 1.0:
raise ValueError("Minimal bin height too large for the number of bins")
widths = F.softmax(unnormalized_widths, dim=-1)
widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
cumwidths = torch.cumsum(widths, dim=-1)
cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
cumwidths = (right - left) * cumwidths + left
cumwidths[..., 0] = left
cumwidths[..., -1] = right
widths = cumwidths[..., 1:] - cumwidths[..., :-1]
derivatives = min_derivative + F.softplus(unnormalized_derivatives)
heights = F.softmax(unnormalized_heights, dim=-1)
heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
cumheights = torch.cumsum(heights, dim=-1)
cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
cumheights = (top - bottom) * cumheights + bottom
cumheights[..., 0] = bottom
cumheights[..., -1] = top
heights = cumheights[..., 1:] - cumheights[..., :-1]
if inverse:
bin_idx = searchsorted(cumheights, inputs)[..., None]
else:
bin_idx = searchsorted(cumwidths, inputs)[..., None]
input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
delta = heights / widths
input_delta = delta.gather(-1, bin_idx)[..., 0]
input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
input_heights = heights.gather(-1, bin_idx)[..., 0]
if inverse:
a = (inputs - input_cumheights) * (
input_derivatives + input_derivatives_plus_one - 2 * input_delta
) + input_heights * (input_delta - input_derivatives)
b = input_heights * input_derivatives - (inputs - input_cumheights) * (
input_derivatives + input_derivatives_plus_one - 2 * input_delta
)
c = -input_delta * (inputs - input_cumheights)
discriminant = b.pow(2) - 4 * a * c
assert (discriminant >= 0).all()
root = (2 * c) / (-b - torch.sqrt(discriminant))
outputs = root * input_bin_widths + input_cumwidths
theta_one_minus_theta = root * (1 - root)
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta
)
derivative_numerator = input_delta.pow(2) * (
input_derivatives_plus_one * root.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - root).pow(2)
)
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, -logabsdet
else:
theta = (inputs - input_cumwidths) / input_bin_widths
theta_one_minus_theta = theta * (1 - theta)
numerator = input_heights * (
input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
)
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta
)
outputs = input_cumheights + numerator / denominator
derivative_numerator = input_delta.pow(2) * (
input_derivatives_plus_one * theta.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - theta).pow(2)
)
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, logabsdet
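# Invertibility check (illustrative): with linear tails, applying the transform
# forward and then inverse should recover the inputs, and the per-element
# log-determinants should cancel. Shapes and the tail bound are arbitrary here.
if __name__ == "__main__":
    torch.manual_seed(0)
    num_bins = 8
    x = torch.randn(4, 10)
    w = torch.randn(4, 10, num_bins)
    h = torch.randn(4, 10, num_bins)
    d = torch.randn(4, 10, num_bins - 1)
    y, logdet = piecewise_rational_quadratic_transform(
        x, w, h, d, inverse=False, tails="linear", tail_bound=5.0
    )
    x_rec, logdet_inv = piecewise_rational_quadratic_transform(
        y, w, h, d, inverse=True, tails="linear", tail_bound=5.0
    )
    print(torch.allclose(x, x_rec, atol=1e-4), torch.allclose(logdet, -logdet_inv, atol=1e-4))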

@ -0,0 +1,163 @@
from io import BytesIO
import pickle
import time
import torch
from tqdm import tqdm
from collections import OrderedDict
def load_inputs(path, device, is_half=False):
parm = torch.load(path, map_location=torch.device("cpu"))
for key in parm.keys():
parm[key] = parm[key].to(device)
if is_half and parm[key].dtype == torch.float32:
parm[key] = parm[key].half()
elif not is_half and parm[key].dtype == torch.float16:
parm[key] = parm[key].float()
return parm
def benchmark(
model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False
):
parm = load_inputs(inputs_path, device, is_half)
total_ts = 0.0
bar = tqdm(range(epoch))
for i in bar:
start_time = time.perf_counter()
o = model(**parm)
total_ts += time.perf_counter() - start_time
print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}")
def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False):
benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half)
def to_jit_model(
model_path,
model_type: str,
mode: str = "trace",
inputs_path: str = None,
device=torch.device("cpu"),
is_half=False,
):
model = None
if model_type.lower() == "synthesizer":
from .get_synthesizer import get_synthesizer
model, _ = get_synthesizer(model_path, device)
model.forward = model.infer
elif model_type.lower() == "rmvpe":
from .get_rmvpe import get_rmvpe
model = get_rmvpe(model_path, device)
elif model_type.lower() == "hubert":
from .get_hubert import get_hubert_model
model = get_hubert_model(model_path, device)
model.forward = model.infer
else:
raise ValueError(f"No model type named {model_type}")
model = model.eval()
model = model.half() if is_half else model.float()
if mode == "trace":
assert inputs_path is not None, "trace mode requires an inputs_path"
inputs = load_inputs(inputs_path, device, is_half)
model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
elif mode == "script":
model_jit = torch.jit.script(model)
model_jit.to(device)
model_jit = model_jit.half() if is_half else model_jit.float()
# model = model.half() if is_half else model.float()
return (model, model_jit)
def export(
model: torch.nn.Module,
mode: str = "trace",
inputs: dict = None,
device=torch.device("cpu"),
is_half: bool = False,
) -> dict:
model = model.half() if is_half else model.float()
model.eval()
if mode == "trace":
assert inputs is not None
model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
elif mode == "script":
model_jit = torch.jit.script(model)
model_jit.to(device)
model_jit = model_jit.half() if is_half else model_jit.float()
buffer = BytesIO()
# model_jit=model_jit.cpu()
torch.jit.save(model_jit, buffer)
del model_jit
cpt = OrderedDict()
cpt["model"] = buffer.getvalue()
cpt["is_half"] = is_half
return cpt
def load(path: str):
with open(path, "rb") as f:
return pickle.load(f)
def save(ckpt: dict, save_path: str):
with open(save_path, "wb") as f:
pickle.dump(ckpt, f)
def rmvpe_jit_export(
model_path: str,
mode: str = "script",
inputs_path: str = None,
save_path: str = None,
device=torch.device("cpu"),
is_half=False,
):
if not save_path:
save_path = model_path.rstrip(".pth")
save_path += ".half.jit" if is_half else ".jit"
if "cuda" in str(device) and ":" not in str(device):
device = torch.device("cuda:0")
from .get_rmvpe import get_rmvpe
model = get_rmvpe(model_path, device)
inputs = None
if mode == "trace":
inputs = load_inputs(inputs_path, device, is_half)
ckpt = export(model, mode, inputs, device, is_half)
ckpt["device"] = str(device)
save(ckpt, save_path)
return ckpt
def synthesizer_jit_export(
model_path: str,
mode: str = "script",
inputs_path: str = None,
save_path: str = None,
device=torch.device("cpu"),
is_half=False,
):
if not save_path:
save_path = model_path.rstrip(".pth")
save_path += ".half.jit" if is_half else ".jit"
if "cuda" in str(device) and ":" not in str(device):
device = torch.device("cuda:0")
from .get_synthesizer import get_synthesizer
model, cpt = get_synthesizer(model_path, device)
assert isinstance(cpt, dict)
model.forward = model.infer
inputs = None
if mode == "trace":
inputs = load_inputs(inputs_path, device, is_half)
ckpt = export(model, mode, inputs, device, is_half)
cpt.pop("weight")
cpt["model"] = ckpt["model"]
cpt["device"] = device
save(cpt, save_path)
return cpt
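# Minimal usage sketch: script-export a synthesizer checkpoint and reload the
# pickled TorchScript bundle. The .pth path below is hypothetical.
if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    synthesizer_jit_export("assets/weights/model.pth", mode="script", device=device, is_half=False)
    bundle = load("assets/weights/model.jit")  # the default save path derived above
    net = torch.jit.load(BytesIO(bundle["model"]), map_location=device)
    print(bundle["config"][-1])  # the last config entry is the target sample rate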

@ -0,0 +1,342 @@
import math
import random
from typing import Optional, Tuple
from fairseq.checkpoint_utils import load_model_ensemble_and_task
import numpy as np
import torch
import torch.nn.functional as F
# from fairseq.data.data_utils import compute_mask_indices
from fairseq.utils import index_put
# @torch.jit.script
def pad_to_multiple(x, multiple, dim=-1, value=0):
# Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41
if x is None:
return None, 0
tsz = x.size(dim)
m = tsz / multiple
remainder = math.ceil(m) * multiple - tsz
if int(tsz % multiple) == 0:
return x, 0
pad_offset = (0,) * (-1 - dim) * 2
return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder
def extract_features(
self,
x,
padding_mask=None,
tgt_layer=None,
min_layer=0,
):
if padding_mask is not None:
x = index_put(x, padding_mask, 0)
x_conv = self.pos_conv(x.transpose(1, 2))
x_conv = x_conv.transpose(1, 2)
x = x + x_conv
if not self.layer_norm_first:
x = self.layer_norm(x)
# pad to the sequence length dimension
x, pad_length = pad_to_multiple(x, self.required_seq_len_multiple, dim=-2, value=0)
if pad_length > 0 and padding_mask is None:
padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
padding_mask[:, -pad_length:] = True
else:
padding_mask, _ = pad_to_multiple(
padding_mask, self.required_seq_len_multiple, dim=-1, value=True
)
x = F.dropout(x, p=self.dropout, training=self.training)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
layer_results = []
r = None
for i, layer in enumerate(self.layers):
dropout_probability = np.random.random() if self.layerdrop > 0 else 1
if not self.training or (dropout_probability > self.layerdrop):
x, (z, lr) = layer(
x, self_attn_padding_mask=padding_mask, need_weights=False
)
if i >= min_layer:
layer_results.append((x, z, lr))
if i == tgt_layer:
r = x
break
if r is not None:
x = r
# T x B x C -> B x T x C
x = x.transpose(0, 1)
# undo padding
if pad_length > 0:
x = x[:, :-pad_length]
def undo_pad(a, b, c):
return (
a[:-pad_length],
b[:-pad_length] if b is not None else b,
c[:-pad_length],
)
layer_results = [undo_pad(*u) for u in layer_results]
return x, layer_results
def compute_mask_indices(
shape: Tuple[int, int],
padding_mask: Optional[torch.Tensor],
mask_prob: float,
mask_length: int,
mask_type: str = "static",
mask_other: float = 0.0,
min_masks: int = 0,
no_overlap: bool = False,
min_space: int = 0,
require_same_masks: bool = True,
mask_dropout: float = 0.0,
) -> torch.Tensor:
"""
Computes random mask spans for a given shape
Args:
shape: the shape for which to compute masks.
should be of size 2 where first element is batch size and 2nd is timesteps
padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
mask_type: how to compute mask lengths
static = fixed size
uniform = sample from uniform distribution [mask_other, mask_length*2]
normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
poisson = sample from Poisson distribution with lambda = mask length
min_masks: minimum number of masked spans
no_overlap: if true, will use a recursive algorithm that prevents spans from overlapping
min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
mask_dropout: randomly dropout this percentage of masks in each example
"""
bsz, all_sz = shape
mask = torch.full((bsz, all_sz), False)
all_num_mask = int(
# add a random number for probabilistic rounding
mask_prob * all_sz / float(mask_length)
+ torch.rand([1]).item()
)
all_num_mask = max(min_masks, all_num_mask)
mask_idcs = []
for i in range(bsz):
if padding_mask is not None:
sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
num_mask = max(min_masks, num_mask)
else:
sz = all_sz
num_mask = all_num_mask
if mask_type == "static":
lengths = torch.full([num_mask], mask_length)
elif mask_type == "uniform":
lengths = torch.randint(mask_other, mask_length * 2 + 1, size=[num_mask])
elif mask_type == "normal":
lengths = torch.normal(mask_length, mask_other, size=[num_mask])
lengths = [max(1, int(round(x))) for x in lengths]
else:
raise Exception("unknown mask selection " + mask_type)
if sum(lengths) == 0:
lengths[0] = min(mask_length, sz - 1)
if no_overlap:
mask_idc = []
def arrange(s, e, length, keep_length):
span_start = torch.randint(low=s, high=e - length, size=[1]).item()
mask_idc.extend(span_start + i for i in range(length))
new_parts = []
if span_start - s - min_space >= keep_length:
new_parts.append((s, span_start - min_space + 1))
if e - span_start - length - min_space > keep_length:
new_parts.append((span_start + length + min_space, e))
return new_parts
parts = [(0, sz)]
min_length = min(lengths)
for length in sorted(lengths, reverse=True):
t = [e - s if e - s >= length + min_space else 0 for s, e in parts]
lens = torch.asarray(t, dtype=torch.int)
l_sum = torch.sum(lens)
if l_sum == 0:
break
probs = lens / torch.sum(lens)
c = torch.multinomial(probs.float(), 1).item()  # draw one part index, weighted by usable length (.item() on a multi-sample draw would fail)
s, e = parts.pop(c)
parts.extend(arrange(s, e, length, min_length))
mask_idc = torch.asarray(mask_idc)
else:
min_len = min(lengths)
if sz - min_len <= num_mask:
min_len = sz - num_mask - 1
mask_idc = torch.asarray(
random.sample([i for i in range(sz - min_len)], num_mask)
)
mask_idc = torch.asarray(
[
mask_idc[j] + offset
for j in range(len(mask_idc))
for offset in range(lengths[j])
]
)
mask_idcs.append(torch.unique(mask_idc[mask_idc < sz]))
min_len = min([len(m) for m in mask_idcs])
for i, mask_idc in enumerate(mask_idcs):
if isinstance(mask_idc, torch.Tensor):
mask_idc = torch.asarray(mask_idc, dtype=torch.float)
if len(mask_idc) > min_len and require_same_masks:
# keep a random subset of min_len mask positions; the original
# random.sample(range(mask_idc), ...) passed a tensor to range() and would raise
mask_idc = mask_idc[torch.randperm(len(mask_idc))[:min_len]]
if mask_dropout > 0:
num_holes = int(round(len(mask_idc) * mask_dropout))
# randomly drop num_holes of the remaining mask positions
mask_idc = mask_idc[torch.randperm(len(mask_idc))[: len(mask_idc) - num_holes]]
mask[i, mask_idc.int()] = True
return mask
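# Example (illustrative): mask a batch of 2 utterances with 100 frames each, no padding,
# using length-10 spans at 65% probability:
#
#   mask = compute_mask_indices((2, 100), None, mask_prob=0.65, mask_length=10)
#   # -> (2, 100) bool tensor; True marks frames to be replaced by the model's mask embedding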
def apply_mask(self, x, padding_mask, target_list):
B, T, C = x.shape
torch.zeros_like(x)
if self.mask_prob > 0:
mask_indices = compute_mask_indices(
(B, T),
padding_mask,
self.mask_prob,
self.mask_length,
self.mask_selection,
self.mask_other,
min_masks=2,
no_overlap=self.no_mask_overlap,
min_space=self.mask_min_space,
)
mask_indices = mask_indices.to(x.device)
x[mask_indices] = self.mask_emb
else:
mask_indices = None
if self.mask_channel_prob > 0:
mask_channel_indices = compute_mask_indices(
(B, C),
None,
self.mask_channel_prob,
self.mask_channel_length,
self.mask_channel_selection,
self.mask_channel_other,
no_overlap=self.no_mask_channel_overlap,
min_space=self.mask_channel_min_space,
)
mask_channel_indices = (
mask_channel_indices.to(x.device).unsqueeze(1).expand(-1, T, -1)
)
x[mask_channel_indices] = 0
return x, mask_indices
def get_hubert_model(
model_path="assets/hubert/hubert_base.pt", device=torch.device("cpu")
):
models, _, _ = load_model_ensemble_and_task(
[model_path],
suffix="",
)
hubert_model = models[0]
hubert_model = hubert_model.to(device)
def _apply_mask(x, padding_mask, target_list):
return apply_mask(hubert_model, x, padding_mask, target_list)
hubert_model.apply_mask = _apply_mask
def _extract_features(
x,
padding_mask=None,
tgt_layer=None,
min_layer=0,
):
return extract_features(
hubert_model.encoder,
x,
padding_mask=padding_mask,
tgt_layer=tgt_layer,
min_layer=min_layer,
)
hubert_model.encoder.extract_features = _extract_features
hubert_model._forward = hubert_model.forward
def hubert_extract_features(
self,
source: torch.Tensor,
padding_mask: Optional[torch.Tensor] = None,
mask: bool = False,
ret_conv: bool = False,
output_layer: Optional[int] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
res = self._forward(
source,
padding_mask=padding_mask,
mask=mask,
features_only=True,
output_layer=output_layer,
)
feature = res["features"] if ret_conv else res["x"]
return feature, res["padding_mask"]
def _hubert_extract_features(
source: torch.Tensor,
padding_mask: Optional[torch.Tensor] = None,
mask: bool = False,
ret_conv: bool = False,
output_layer: Optional[int] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
return hubert_extract_features(
hubert_model, source, padding_mask, mask, ret_conv, output_layer
)
hubert_model.extract_features = _hubert_extract_features
def infer(source, padding_mask, output_layer: torch.Tensor):
output_layer = output_layer.item()
logits = hubert_model.extract_features(
source=source, padding_mask=padding_mask, output_layer=output_layer
)
feats = hubert_model.final_proj(logits[0]) if output_layer == 9 else logits[0]
return feats
hubert_model.infer = infer
# hubert_model.forward=infer
# hubert_model.forward
return hubert_model
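# Minimal usage sketch: load the HuBERT checkpoint (the repo's usual default path,
# assumed to exist) and extract features for one second of silence. output_layer 9
# is routed through final_proj, as in the infer() wrapper above.
if __name__ == "__main__":
    hubert = get_hubert_model("assets/hubert/hubert_base.pt", torch.device("cpu")).float().eval()
    wav = torch.zeros(1, 16000)
    pad = torch.zeros(1, 16000, dtype=torch.bool)
    with torch.no_grad():
        feats = hubert.infer(wav, pad, torch.tensor(9))
    print(feats.shape)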

@ -0,0 +1,12 @@
import torch
def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
from infer.lib.rmvpe import E2E
model = E2E(4, 1, (2, 2))
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt)
model.eval()
model = model.to(device)
return model

@ -0,0 +1,38 @@
import torch
def get_synthesizer(pth_path, device=torch.device("cpu")):
from infer.lib.infer_pack.models import (
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
SynthesizerTrnMs768NSFsid,
SynthesizerTrnMs768NSFsid_nono,
)
cpt = torch.load(pth_path, map_location=torch.device("cpu"))
# tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
if_f0 = cpt.get("f0", 1)
version = cpt.get("version", "v1")
if version == "v1":
if if_f0 == 1:
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False)
else:
net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
elif version == "v2":
if if_f0 == 1:
net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False)
else:
net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
del net_g.enc_q
# net_g.forward = net_g.infer
# ckpt = {}
# ckpt["config"] = cpt["config"]
# ckpt["f0"] = if_f0
# ckpt["version"] = version
# ckpt["info"] = cpt.get("info", "0epoch")
net_g.load_state_dict(cpt["weight"], strict=False)
net_g = net_g.float()
net_g.eval().to(device)
net_g.remove_weight_norm()
return net_g, cpt
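# Minimal usage sketch: load a trained checkpoint (hypothetical path) and inspect it.
if __name__ == "__main__":
    net_g, cpt = get_synthesizer("assets/weights/model.pth", torch.device("cpu"))
    print(cpt.get("version", "v1"), cpt.get("f0", 1), cpt["config"][-1])  # version, f0 flag, target sample rate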

@ -0,0 +1,670 @@
from io import BytesIO
import os
from typing import List, Optional, Tuple
import numpy as np
import torch
from infer.lib import jit
try:
# Fix "Torch not compiled with CUDA enabled"
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
if torch.xpu.is_available():
from infer.modules.ipex import ipex_init
ipex_init()
except Exception: # pylint: disable=broad-exception-caught
pass
import torch.nn as nn
import torch.nn.functional as F
from librosa.util import normalize, pad_center, tiny
from scipy.signal import get_window
import logging
logger = logging.getLogger(__name__)
class STFT(torch.nn.Module):
def __init__(
self, filter_length=1024, hop_length=512, win_length=None, window="hann"
):
"""
This module implements an STFT using 1D convolution and 1D transpose convolutions.
This is a bit tricky: matching input and output sizes across every overlap-add
setup is hard, so some configurations may not work. Right now, this code is only
expected to work with hop lengths that are half the filter length (50% overlap
between frames).
Keyword Arguments:
filter_length {int} -- Length of filters used (default: {1024})
hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512})
win_length {[type]} -- Length of the window function applied to each frame (if not specified, it
equals the filter length). (default: {None})
window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris)
(default: {'hann'})
"""
super(STFT, self).__init__()
self.filter_length = filter_length
self.hop_length = hop_length
self.win_length = win_length if win_length else filter_length
self.window = window
self.forward_transform = None
self.pad_amount = int(self.filter_length / 2)
fourier_basis = np.fft.fft(np.eye(self.filter_length))
cutoff = int((self.filter_length / 2 + 1))
fourier_basis = np.vstack(
[np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
)
forward_basis = torch.FloatTensor(fourier_basis)
inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis))
assert filter_length >= self.win_length
# get window and zero center pad it to filter_length
fft_window = get_window(window, self.win_length, fftbins=True)
fft_window = pad_center(fft_window, size=filter_length)
fft_window = torch.from_numpy(fft_window).float()
# window the bases
forward_basis *= fft_window
inverse_basis = (inverse_basis.T * fft_window).T
self.register_buffer("forward_basis", forward_basis.float())
self.register_buffer("inverse_basis", inverse_basis.float())
self.register_buffer("fft_window", fft_window.float())
def transform(self, input_data, return_phase=False):
"""Take input data (audio) to STFT domain.
Arguments:
input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)
Returns:
magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
num_frequencies, num_frames)
phase {tensor} -- Phase of STFT with shape (num_batch,
num_frequencies, num_frames)
"""
input_data = F.pad(
input_data,
(self.pad_amount, self.pad_amount),
mode="reflect",
)
forward_transform = input_data.unfold(
1, self.filter_length, self.hop_length
).permute(0, 2, 1)
forward_transform = torch.matmul(self.forward_basis, forward_transform)
cutoff = int((self.filter_length / 2) + 1)
real_part = forward_transform[:, :cutoff, :]
imag_part = forward_transform[:, cutoff:, :]
magnitude = torch.sqrt(real_part**2 + imag_part**2)
if return_phase:
phase = torch.atan2(imag_part.data, real_part.data)
return magnitude, phase
else:
return magnitude
def inverse(self, magnitude, phase):
"""Call the inverse STFT (iSTFT), given magnitude and phase tensors produced
by the ```transform``` function.
Arguments:
magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
num_frequencies, num_frames)
phase {tensor} -- Phase of STFT with shape (num_batch,
num_frequencies, num_frames)
Returns:
inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of
shape (num_batch, num_samples)
"""
cat = torch.cat(
[magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
)
fold = torch.nn.Fold(
output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length),
kernel_size=(1, self.filter_length),
stride=(1, self.hop_length),
)
inverse_transform = torch.matmul(self.inverse_basis, cat)
inverse_transform = fold(inverse_transform)[
:, 0, 0, self.pad_amount : -self.pad_amount
]
window_square_sum = (
self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0)
)
window_square_sum = fold(window_square_sum)[
:, 0, 0, self.pad_amount : -self.pad_amount
]
inverse_transform /= window_square_sum
return inverse_transform
def forward(self, input_data):
"""Take input data (audio) to STFT domain and then back to audio.
Arguments:
input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)
Returns:
reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of
shape (num_batch, num_samples)
"""
self.magnitude, self.phase = self.transform(input_data, return_phase=True)
reconstruction = self.inverse(self.magnitude, self.phase)
return reconstruction
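# Example (illustrative): with 50% overlap the transform/inverse pair reconstructs
# the input up to edge effects:
#
#   stft = STFT(filter_length=1024, hop_length=512, win_length=1024, window="hann")
#   audio = torch.randn(1, 16000)
#   mag, phase = stft.transform(audio, return_phase=True)
#   recon = stft.inverse(mag, phase)  # approximately equal to audio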
from time import time as ttime
class BiGRU(nn.Module):
def __init__(self, input_features, hidden_features, num_layers):
super(BiGRU, self).__init__()
self.gru = nn.GRU(
input_features,
hidden_features,
num_layers=num_layers,
batch_first=True,
bidirectional=True,
)
def forward(self, x):
return self.gru(x)[0]
class ConvBlockRes(nn.Module):
def __init__(self, in_channels, out_channels, momentum=0.01):
super(ConvBlockRes, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias=False,
),
nn.BatchNorm2d(out_channels, momentum=momentum),
nn.ReLU(),
nn.Conv2d(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias=False,
),
nn.BatchNorm2d(out_channels, momentum=momentum),
nn.ReLU(),
)
# self.shortcut:Optional[nn.Module] = None
if in_channels != out_channels:
self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
def forward(self, x: torch.Tensor):
if not hasattr(self, "shortcut"):
return self.conv(x) + x
else:
return self.conv(x) + self.shortcut(x)
class Encoder(nn.Module):
def __init__(
self,
in_channels,
in_size,
n_encoders,
kernel_size,
n_blocks,
out_channels=16,
momentum=0.01,
):
super(Encoder, self).__init__()
self.n_encoders = n_encoders
self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
self.layers = nn.ModuleList()
self.latent_channels = []
for i in range(self.n_encoders):
self.layers.append(
ResEncoderBlock(
in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
)
)
self.latent_channels.append([out_channels, in_size])
in_channels = out_channels
out_channels *= 2
in_size //= 2
self.out_size = in_size
self.out_channel = out_channels
def forward(self, x: torch.Tensor):
concat_tensors: List[torch.Tensor] = []
x = self.bn(x)
for i, layer in enumerate(self.layers):
t, x = layer(x)
concat_tensors.append(t)
return x, concat_tensors
class ResEncoderBlock(nn.Module):
def __init__(
self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
):
super(ResEncoderBlock, self).__init__()
self.n_blocks = n_blocks
self.conv = nn.ModuleList()
self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
for i in range(n_blocks - 1):
self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
self.kernel_size = kernel_size
if self.kernel_size is not None:
self.pool = nn.AvgPool2d(kernel_size=kernel_size)
def forward(self, x):
for i, conv in enumerate(self.conv):
x = conv(x)
if self.kernel_size is not None:
return x, self.pool(x)
else:
return x
class Intermediate(nn.Module): #
def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
super(Intermediate, self).__init__()
self.n_inters = n_inters
self.layers = nn.ModuleList()
self.layers.append(
ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
)
for i in range(self.n_inters - 1):
self.layers.append(
ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
)
def forward(self, x):
for i, layer in enumerate(self.layers):
x = layer(x)
return x
class ResDecoderBlock(nn.Module):
def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
super(ResDecoderBlock, self).__init__()
out_padding = (0, 1) if stride == (1, 2) else (1, 1)
self.n_blocks = n_blocks
self.conv1 = nn.Sequential(
nn.ConvTranspose2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=stride,
padding=(1, 1),
output_padding=out_padding,
bias=False,
),
nn.BatchNorm2d(out_channels, momentum=momentum),
nn.ReLU(),
)
self.conv2 = nn.ModuleList()
self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
for i in range(n_blocks - 1):
self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
def forward(self, x, concat_tensor):
x = self.conv1(x)
x = torch.cat((x, concat_tensor), dim=1)
for i, conv2 in enumerate(self.conv2):
x = conv2(x)
return x
class Decoder(nn.Module):
def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
super(Decoder, self).__init__()
self.layers = nn.ModuleList()
self.n_decoders = n_decoders
for i in range(self.n_decoders):
out_channels = in_channels // 2
self.layers.append(
ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
)
in_channels = out_channels
def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]):
for i, layer in enumerate(self.layers):
x = layer(x, concat_tensors[-1 - i])
return x
class DeepUnet(nn.Module):
def __init__(
self,
kernel_size,
n_blocks,
en_de_layers=5,
inter_layers=4,
in_channels=1,
en_out_channels=16,
):
super(DeepUnet, self).__init__()
self.encoder = Encoder(
in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
)
self.intermediate = Intermediate(
self.encoder.out_channel // 2,
self.encoder.out_channel,
inter_layers,
n_blocks,
)
self.decoder = Decoder(
self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x, concat_tensors = self.encoder(x)
x = self.intermediate(x)
x = self.decoder(x, concat_tensors)
return x
class E2E(nn.Module):
def __init__(
self,
n_blocks,
n_gru,
kernel_size,
en_de_layers=5,
inter_layers=4,
in_channels=1,
en_out_channels=16,
):
super(E2E, self).__init__()
self.unet = DeepUnet(
kernel_size,
n_blocks,
en_de_layers,
inter_layers,
in_channels,
en_out_channels,
)
self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
if n_gru:
self.fc = nn.Sequential(
BiGRU(3 * 128, 256, n_gru),
nn.Linear(512, 360),
nn.Dropout(0.25),
nn.Sigmoid(),
)
else:
self.fc = nn.Sequential(
nn.Linear(3 * 128, 360), nn.Dropout(0.25), nn.Sigmoid()  # torch.nn has no N_MELS/N_CLASS; use the 128 mel bins and 360 cent classes of the GRU branch
)
def forward(self, mel):
# print(mel.shape)
mel = mel.transpose(-1, -2).unsqueeze(1)
x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
x = self.fc(x)
# print(x.shape)
return x
from librosa.filters import mel
class MelSpectrogram(torch.nn.Module):
def __init__(
self,
is_half,
n_mel_channels,
sampling_rate,
win_length,
hop_length,
n_fft=None,
mel_fmin=0,
mel_fmax=None,
clamp=1e-5,
):
super().__init__()
n_fft = win_length if n_fft is None else n_fft
self.hann_window = {}
mel_basis = mel(
sr=sampling_rate,
n_fft=n_fft,
n_mels=n_mel_channels,
fmin=mel_fmin,
fmax=mel_fmax,
htk=True,
)
mel_basis = torch.from_numpy(mel_basis).float()
self.register_buffer("mel_basis", mel_basis)
self.n_fft = win_length if n_fft is None else n_fft
self.hop_length = hop_length
self.win_length = win_length
self.sampling_rate = sampling_rate
self.n_mel_channels = n_mel_channels
self.clamp = clamp
self.is_half = is_half
def forward(self, audio, keyshift=0, speed=1, center=True):
factor = 2 ** (keyshift / 12)
n_fft_new = int(np.round(self.n_fft * factor))
win_length_new = int(np.round(self.win_length * factor))
hop_length_new = int(np.round(self.hop_length * speed))
keyshift_key = str(keyshift) + "_" + str(audio.device)
if keyshift_key not in self.hann_window:
self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
audio.device
)
if "privateuseone" in str(audio.device):
if not hasattr(self, "stft"):
self.stft = STFT(
filter_length=n_fft_new,
hop_length=hop_length_new,
win_length=win_length_new,
window="hann",
).to(audio.device)
magnitude = self.stft.transform(audio)
else:
fft = torch.stft(
audio,
n_fft=n_fft_new,
hop_length=hop_length_new,
win_length=win_length_new,
window=self.hann_window[keyshift_key],
center=center,
return_complex=True,
)
magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
if keyshift != 0:
size = self.n_fft // 2 + 1
resize = magnitude.size(1)
if resize < size:
magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
mel_output = torch.matmul(self.mel_basis, magnitude)
if self.is_half:
mel_output = mel_output.half()
log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
return log_mel_spec
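# Example (illustrative): the RMVPE wrapper below builds this module as
# MelSpectrogram(is_half, 128, 16000, 1024, 160, None, 30, 8000), i.e. 128 mel bins
# over 30-8000 Hz at 16 kHz with a 160-sample hop, so
#
#   mel = mel_extractor(torch.randn(1, 16000))  # -> (1, 128, ~101) log-mel frames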
class RMVPE:
def __init__(self, model_path: str, is_half, device=None, use_jit=False):
self.resample_kernel = {}
self.is_half = is_half
if device is None:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.device = device
self.mel_extractor = MelSpectrogram(
is_half, 128, 16000, 1024, 160, None, 30, 8000
).to(device)
if "privateuseone" in str(device):
import onnxruntime as ort
ort_session = ort.InferenceSession(
"%s/rmvpe.onnx" % os.environ["rmvpe_root"],
providers=["DmlExecutionProvider"],
)
self.model = ort_session
else:
if str(self.device) == "cuda":
self.device = torch.device("cuda:0")
def get_jit_model():
jit_model_path = model_path.rstrip(".pth")
jit_model_path += ".half.jit" if is_half else ".jit"
reload = False
if os.path.exists(jit_model_path):
ckpt = jit.load(jit_model_path)
model_device = ckpt["device"]
if model_device != str(self.device):
reload = True
else:
reload = True
if reload:
ckpt = jit.rmvpe_jit_export(
model_path=model_path,
mode="script",
inputs_path=None,
save_path=jit_model_path,
device=device,
is_half=is_half,
)
model = torch.jit.load(BytesIO(ckpt["model"]), map_location=device)
return model
def get_default_model():
model = E2E(4, 1, (2, 2))
ckpt = torch.load(model_path, map_location="cpu")
model.load_state_dict(ckpt)
model.eval()
if is_half:
model = model.half()
else:
model = model.float()
return model
if use_jit:
if is_half and "cpu" in str(self.device):
logger.warning(
"Use default rmvpe model. \
Jit is not supported on the CPU for half floating point"
)
self.model = get_default_model()
else:
self.model = get_jit_model()
else:
self.model = get_default_model()
self.model = self.model.to(device)
cents_mapping = 20 * np.arange(360) + 1997.3794084376191
self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
def mel2hidden(self, mel):
with torch.no_grad():
n_frames = mel.shape[-1]
n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
if n_pad > 0:
mel = F.pad(mel, (0, n_pad), mode="constant")
if "privateuseone" in str(self.device):
onnx_input_name = self.model.get_inputs()[0].name
onnx_outputs_names = self.model.get_outputs()[0].name
hidden = self.model.run(
[onnx_outputs_names],
input_feed={onnx_input_name: mel.cpu().numpy()},
)[0]
else:
mel = mel.half() if self.is_half else mel.float()
hidden = self.model(mel)
return hidden[:, :n_frames]
def decode(self, hidden, thred=0.03):
cents_pred = self.to_local_average_cents(hidden, thred=thred)
f0 = 10 * (2 ** (cents_pred / 1200))
f0[f0 == 10] = 0
# f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
return f0
def infer_from_audio(self, audio, thred=0.03):
# torch.cuda.synchronize()
# t0 = ttime()
if not torch.is_tensor(audio):
audio = torch.from_numpy(audio)
mel = self.mel_extractor(
audio.float().to(self.device).unsqueeze(0), center=True
)
# print(123123123,mel.device.type)
# torch.cuda.synchronize()
# t1 = ttime()
hidden = self.mel2hidden(mel)
# torch.cuda.synchronize()
# t2 = ttime()
# print(234234,hidden.device.type)
if "privateuseone" not in str(self.device):
hidden = hidden.squeeze(0).cpu().numpy()
else:
hidden = hidden[0]
if self.is_half:
hidden = hidden.astype("float32")
f0 = self.decode(hidden, thred=thred)
# torch.cuda.synchronize()
# t3 = ttime()
# print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
return f0
def to_local_average_cents(self, salience, thred=0.05):
# t0 = ttime()
center = np.argmax(salience, axis=1)  # (n_frames,) index of the strongest bin per frame
salience = np.pad(salience, ((0, 0), (4, 4)))  # (n_frames, 368)
# t1 = ttime()
center += 4
todo_salience = []
todo_cents_mapping = []
starts = center - 4
ends = center + 5
for idx in range(salience.shape[0]):
todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
# t2 = ttime()
todo_salience = np.array(todo_salience)  # (n_frames, 9)
todo_cents_mapping = np.array(todo_cents_mapping)  # (n_frames, 9)
product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
weight_sum = np.sum(todo_salience, 1)  # (n_frames,)
devided = product_sum / weight_sum  # weighted-average cents per frame, (n_frames,)
# t3 = ttime()
maxx = np.max(salience, axis=1)  # (n_frames,) peak salience per frame
devided[maxx <= thred] = 0
# t4 = ttime()
# print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
return devided
if __name__ == "__main__":
import librosa
import soundfile as sf
audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav")
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
audio_bak = audio.copy()
if sampling_rate != 16000:
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt"
thred = 0.03 # 0.01
device = "cuda" if torch.cuda.is_available() else "cpu"
rmvpe = RMVPE(model_path, is_half=False, device=device)
t0 = ttime()
f0 = rmvpe.infer_from_audio(audio, thred=thred)
# f0 = rmvpe.infer_from_audio(audio, thred=thred)
# f0 = rmvpe.infer_from_audio(audio, thred=thred)
# f0 = rmvpe.infer_from_audio(audio, thred=thred)
# f0 = rmvpe.infer_from_audio(audio, thred=thred)
t1 = ttime()
logger.info("%s %.2f", f0.shape, t1 - t0)

@ -0,0 +1,260 @@
import numpy as np
# This function is obtained from librosa.
def get_rms(
y,
frame_length=2048,
hop_length=512,
pad_mode="constant",
):
padding = (int(frame_length // 2), int(frame_length // 2))
y = np.pad(y, padding, mode=pad_mode)
axis = -1
# put our new within-frame axis at the end for now
out_strides = y.strides + tuple([y.strides[axis]])
# Reduce the shape on the framing axis
x_shape_trimmed = list(y.shape)
x_shape_trimmed[axis] -= frame_length - 1
out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
if axis < 0:
target_axis = axis - 1
else:
target_axis = axis + 1
xw = np.moveaxis(xw, -1, target_axis)
# Downsample along the target axis
slices = [slice(None)] * xw.ndim
slices[axis] = slice(0, None, hop_length)
x = xw[tuple(slices)]
# Calculate power
power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
return np.sqrt(power)
class Slicer:
def __init__(
self,
sr: int,
threshold: float = -40.0,
min_length: int = 5000,
min_interval: int = 300,
hop_size: int = 20,
max_sil_kept: int = 5000,
):
if not min_length >= min_interval >= hop_size:
raise ValueError(
"The following condition must be satisfied: min_length >= min_interval >= hop_size"
)
if not max_sil_kept >= hop_size:
raise ValueError(
"The following condition must be satisfied: max_sil_kept >= hop_size"
)
min_interval = sr * min_interval / 1000
self.threshold = 10 ** (threshold / 20.0)
self.hop_size = round(sr * hop_size / 1000)
self.win_size = min(round(min_interval), 4 * self.hop_size)
self.min_length = round(sr * min_length / 1000 / self.hop_size)
self.min_interval = round(min_interval / self.hop_size)
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
def _apply_slice(self, waveform, begin, end):
if len(waveform.shape) > 1:
return waveform[
:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
]
else:
return waveform[
begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
]
# @timeit
def slice(self, waveform):
if len(waveform.shape) > 1:
samples = waveform.mean(axis=0)
else:
samples = waveform
if samples.shape[0] <= self.min_length:
return [waveform]
rms_list = get_rms(
y=samples, frame_length=self.win_size, hop_length=self.hop_size
).squeeze(0)
sil_tags = []
silence_start = None
clip_start = 0
for i, rms in enumerate(rms_list):
# Keep looping while frame is silent.
if rms < self.threshold:
# Record start of silent frames.
if silence_start is None:
silence_start = i
continue
# Keep looping while frame is not silent and silence start has not been recorded.
if silence_start is None:
continue
# Clear recorded silence start if interval is not enough or clip is too short
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
need_slice_middle = (
i - silence_start >= self.min_interval
and i - clip_start >= self.min_length
)
if not is_leading_silence and not need_slice_middle:
silence_start = None
continue
# Need slicing. Record the range of silent frames to be removed.
if i - silence_start <= self.max_sil_kept:
pos = rms_list[silence_start : i + 1].argmin() + silence_start
if silence_start == 0:
sil_tags.append((0, pos))
else:
sil_tags.append((pos, pos))
clip_start = pos
elif i - silence_start <= self.max_sil_kept * 2:
pos = rms_list[
i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
].argmin()
pos += i - self.max_sil_kept
pos_l = (
rms_list[
silence_start : silence_start + self.max_sil_kept + 1
].argmin()
+ silence_start
)
pos_r = (
rms_list[i - self.max_sil_kept : i + 1].argmin()
+ i
- self.max_sil_kept
)
if silence_start == 0:
sil_tags.append((0, pos_r))
clip_start = pos_r
else:
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
clip_start = max(pos_r, pos)
else:
pos_l = (
rms_list[
silence_start : silence_start + self.max_sil_kept + 1
].argmin()
+ silence_start
)
pos_r = (
rms_list[i - self.max_sil_kept : i + 1].argmin()
+ i
- self.max_sil_kept
)
if silence_start == 0:
sil_tags.append((0, pos_r))
else:
sil_tags.append((pos_l, pos_r))
clip_start = pos_r
silence_start = None
# Deal with trailing silence.
total_frames = rms_list.shape[0]
if (
silence_start is not None
and total_frames - silence_start >= self.min_interval
):
silence_end = min(total_frames, silence_start + self.max_sil_kept)
pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
sil_tags.append((pos, total_frames + 1))
# Apply and return slices.
if len(sil_tags) == 0:
return [waveform]
else:
chunks = []
if sil_tags[0][0] > 0:
chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
for i in range(len(sil_tags) - 1):
chunks.append(
self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
)
if sil_tags[-1][1] < total_frames:
chunks.append(
self._apply_slice(waveform, sil_tags[-1][1], total_frames)
)
return chunks
def main():
import os.path
from argparse import ArgumentParser
import librosa
import soundfile
parser = ArgumentParser()
parser.add_argument("audio", type=str, help="The audio to be sliced")
parser.add_argument(
"--out", type=str, help="Output directory of the sliced audio clips"
)
parser.add_argument(
"--db_thresh",
type=float,
required=False,
default=-40,
help="The dB threshold for silence detection",
)
parser.add_argument(
"--min_length",
type=int,
required=False,
default=5000,
help="The minimum milliseconds required for each sliced audio clip",
)
parser.add_argument(
"--min_interval",
type=int,
required=False,
default=300,
help="The minimum milliseconds for a silence part to be sliced",
)
parser.add_argument(
"--hop_size",
type=int,
required=False,
default=10,
help="Frame length in milliseconds",
)
parser.add_argument(
"--max_sil_kept",
type=int,
required=False,
default=500,
help="The maximum silence length kept around the sliced clip, presented in milliseconds",
)
args = parser.parse_args()
out = args.out
if out is None:
out = os.path.dirname(os.path.abspath(args.audio))
audio, sr = librosa.load(args.audio, sr=None, mono=False)
slicer = Slicer(
sr=sr,
threshold=args.db_thresh,
min_length=args.min_length,
min_interval=args.min_interval,
hop_size=args.hop_size,
max_sil_kept=args.max_sil_kept,
)
chunks = slicer.slice(audio)
if not os.path.exists(out):
os.makedirs(out)
for i, chunk in enumerate(chunks):
if len(chunk.shape) > 1:
chunk = chunk.T
soundfile.write(
os.path.join(
out,
f"%s_%d.wav"
% (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
),
chunk,
sr,
)
if __name__ == "__main__":
main()

@ -0,0 +1,517 @@
import os
import traceback
import logging
logger = logging.getLogger(__name__)
import numpy as np
import torch
import torch.utils.data
from infer.lib.train.mel_processing import spectrogram_torch
from infer.lib.train.utils import load_filepaths_and_text, load_wav_to_torch
class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
"""
1) loads audio, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audiopaths_and_text, hparams):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 5000)
self._filter()
def _filter(self):
"""
Filter text & store spec lengths
"""
# Store spectrogram lengths for Bucketing
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
# spec_length = wav_length // hop_length
audiopaths_and_text_new = []
lengths = []
for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv])
lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))
self.audiopaths_and_text = audiopaths_and_text_new
self.lengths = lengths
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
def get_audio_text_pair(self, audiopath_and_text):
# separate filename and text
file = audiopath_and_text[0]
phone = audiopath_and_text[1]
pitch = audiopath_and_text[2]
pitchf = audiopath_and_text[3]
dv = audiopath_and_text[4]
phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
spec, wav = self.get_audio(file)
dv = self.get_sid(dv)
len_phone = phone.size()[0]
len_spec = spec.size()[-1]
# print(123,phone.shape,pitch.shape,spec.shape)
if len_phone != len_spec:
len_min = min(len_phone, len_spec)
# amor
len_wav = len_min * self.hop_length
spec = spec[:, :len_min]
wav = wav[:, :len_wav]
phone = phone[:len_min, :]
pitch = pitch[:len_min]
pitchf = pitchf[:len_min]
return (spec, wav, phone, pitch, pitchf, dv)
def get_labels(self, phone, pitch, pitchf):
phone = np.load(phone)
phone = np.repeat(phone, 2, axis=0)
pitch = np.load(pitch)
pitchf = np.load(pitchf)
n_num = min(phone.shape[0], 900) # DistributedBucketSampler
# print(234,phone.shape,pitch.shape)
phone = phone[:n_num, :]
pitch = pitch[:n_num]
pitchf = pitchf[:n_num]
phone = torch.FloatTensor(phone)
pitch = torch.LongTensor(pitch)
pitchf = torch.FloatTensor(pitchf)
return phone, pitch, pitchf
def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.sampling_rate:
raise ValueError(
"{} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate
)
)
audio_norm = audio
# audio_norm = audio / self.max_wav_value
# audio_norm = audio / np.abs(audio).max()
audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt")
if os.path.exists(spec_filename):
try:
spec = torch.load(spec_filename)
except:
logger.warning("%s %s", spec_filename, traceback.format_exc())
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
else:
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
return spec, audio_norm
def __getitem__(self, index):
return self.get_audio_text_pair(self.audiopaths_and_text[index])
def __len__(self):
return len(self.audiopaths_and_text)
class TextAudioCollateMultiNSFsid:
"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
def __call__(self, batch):
"""Collate's training batch from normalized text and aduio
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
)
max_spec_len = max([x[0].size(1) for x in batch])
max_wave_len = max([x[1].size(1) for x in batch])
spec_lengths = torch.LongTensor(len(batch))
wave_lengths = torch.LongTensor(len(batch))
spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
spec_padded.zero_()
wave_padded.zero_()
max_phone_len = max([x[2].size(0) for x in batch])
phone_lengths = torch.LongTensor(len(batch))
phone_padded = torch.FloatTensor(
len(batch), max_phone_len, batch[0][2].shape[1]
) # (spec, wav, phone, pitch)
pitch_padded = torch.LongTensor(len(batch), max_phone_len)
pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
phone_padded.zero_()
pitch_padded.zero_()
pitchf_padded.zero_()
# dv = torch.FloatTensor(len(batch), 256)#gin=256
sid = torch.LongTensor(len(batch))
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
spec = row[0]
spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wave = row[1]
wave_padded[i, :, : wave.size(1)] = wave
wave_lengths[i] = wave.size(1)
phone = row[2]
phone_padded[i, : phone.size(0), :] = phone
phone_lengths[i] = phone.size(0)
pitch = row[3]
pitch_padded[i, : pitch.size(0)] = pitch
pitchf = row[4]
pitchf_padded[i, : pitchf.size(0)] = pitchf
# dv[i] = row[5]
sid[i] = row[5]
return (
phone_padded,
phone_lengths,
pitch_padded,
pitchf_padded,
spec_padded,
spec_lengths,
wave_padded,
wave_lengths,
# dv
sid,
)
class TextAudioLoader(torch.utils.data.Dataset):
"""
1) loads audio, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audiopaths_and_text, hparams):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 5000)
self._filter()
def _filter(self):
"""
Filter text & store spec lengths
"""
# Store spectrogram lengths for Bucketing
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
# spec_length = wav_length // hop_length
audiopaths_and_text_new = []
lengths = []
for audiopath, text, dv in self.audiopaths_and_text:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_and_text_new.append([audiopath, text, dv])
lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))
self.audiopaths_and_text = audiopaths_and_text_new
self.lengths = lengths
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
def get_audio_text_pair(self, audiopath_and_text):
# separate filename and text
file = audiopath_and_text[0]
phone = audiopath_and_text[1]
dv = audiopath_and_text[2]
phone = self.get_labels(phone)
spec, wav = self.get_audio(file)
dv = self.get_sid(dv)
len_phone = phone.size()[0]
len_spec = spec.size()[-1]
if len_phone != len_spec:
len_min = min(len_phone, len_spec)
len_wav = len_min * self.hop_length
spec = spec[:, :len_min]
wav = wav[:, :len_wav]
phone = phone[:len_min, :]
return (spec, wav, phone, dv)
def get_labels(self, phone):
phone = np.load(phone)
phone = np.repeat(phone, 2, axis=0)
n_num = min(phone.shape[0], 900) # DistributedBucketSampler
phone = phone[:n_num, :]
phone = torch.FloatTensor(phone)
return phone
def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.sampling_rate:
raise ValueError(
"{} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate
)
)
audio_norm = audio
# audio_norm = audio / self.max_wav_value
# audio_norm = audio / np.abs(audio).max()
audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt")
if os.path.exists(spec_filename):
try:
spec = torch.load(spec_filename)
except:
logger.warning("%s %s", spec_filename, traceback.format_exc())
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
else:
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
return spec, audio_norm
def __getitem__(self, index):
return self.get_audio_text_pair(self.audiopaths_and_text[index])
def __len__(self):
return len(self.audiopaths_and_text)
class TextAudioCollate:
"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
def __call__(self, batch):
"""Collate's training batch from normalized text and aduio
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
)
max_spec_len = max([x[0].size(1) for x in batch])
max_wave_len = max([x[1].size(1) for x in batch])
spec_lengths = torch.LongTensor(len(batch))
wave_lengths = torch.LongTensor(len(batch))
spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
spec_padded.zero_()
wave_padded.zero_()
max_phone_len = max([x[2].size(0) for x in batch])
phone_lengths = torch.LongTensor(len(batch))
phone_padded = torch.FloatTensor(
len(batch), max_phone_len, batch[0][2].shape[1]
)
phone_padded.zero_()
sid = torch.LongTensor(len(batch))
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
spec = row[0]
spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wave = row[1]
wave_padded[i, :, : wave.size(1)] = wave
wave_lengths[i] = wave.size(1)
phone = row[2]
phone_padded[i, : phone.size(0), :] = phone
phone_lengths[i] = phone.size(0)
sid[i] = row[3]
return (
phone_padded,
phone_lengths,
spec_padded,
spec_lengths,
wave_padded,
wave_lengths,
sid,
)
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""
Maintain similar input lengths in a batch.
Length groups are specified by boundaries.
    Ex) boundaries = [b1, b2, b3] -> each sample x is placed in either {x | b1 < length(x) <= b2} or {x | b2 < length(x) <= b3}.
It removes samples which are not included in the boundaries.
Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
"""
def __init__(
self,
dataset,
batch_size,
boundaries,
num_replicas=None,
rank=None,
shuffle=True,
):
super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
self.lengths = dataset.lengths
self.batch_size = batch_size
self.boundaries = boundaries
self.buckets, self.num_samples_per_bucket = self._create_buckets()
self.total_size = sum(self.num_samples_per_bucket)
self.num_samples = self.total_size // self.num_replicas
def _create_buckets(self):
buckets = [[] for _ in range(len(self.boundaries) - 1)]
for i in range(len(self.lengths)):
length = self.lengths[i]
idx_bucket = self._bisect(length)
if idx_bucket != -1:
buckets[idx_bucket].append(i)
for i in range(len(buckets) - 1, -1, -1): #
if len(buckets[i]) == 0:
buckets.pop(i)
self.boundaries.pop(i + 1)
num_samples_per_bucket = []
for i in range(len(buckets)):
len_bucket = len(buckets[i])
total_batch_size = self.num_replicas * self.batch_size
rem = (
total_batch_size - (len_bucket % total_batch_size)
) % total_batch_size
num_samples_per_bucket.append(len_bucket + rem)
return buckets, num_samples_per_bucket
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = []
if self.shuffle:
for bucket in self.buckets:
indices.append(torch.randperm(len(bucket), generator=g).tolist())
else:
for bucket in self.buckets:
indices.append(list(range(len(bucket))))
batches = []
for i in range(len(self.buckets)):
bucket = self.buckets[i]
len_bucket = len(bucket)
ids_bucket = indices[i]
num_samples_bucket = self.num_samples_per_bucket[i]
# add extra samples to make it evenly divisible
rem = num_samples_bucket - len_bucket
ids_bucket = (
ids_bucket
+ ids_bucket * (rem // len_bucket)
+ ids_bucket[: (rem % len_bucket)]
)
# subsample
ids_bucket = ids_bucket[self.rank :: self.num_replicas]
# batching
for j in range(len(ids_bucket) // self.batch_size):
batch = [
bucket[idx]
for idx in ids_bucket[
j * self.batch_size : (j + 1) * self.batch_size
]
]
batches.append(batch)
if self.shuffle:
batch_ids = torch.randperm(len(batches), generator=g).tolist()
batches = [batches[i] for i in batch_ids]
self.batches = batches
assert len(self.batches) * self.batch_size == self.num_samples
return iter(self.batches)
def _bisect(self, x, lo=0, hi=None):
if hi is None:
hi = len(self.boundaries) - 1
if hi > lo:
mid = (hi + lo) // 2
if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
return mid
elif x <= self.boundaries[mid]:
return self._bisect(x, lo, mid)
else:
return self._bisect(x, mid + 1, hi)
else:
return -1
def __len__(self):
return self.num_samples // self.batch_size
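# Minimal wiring sketch (file name and hyper-parameter values are illustrative,
# not taken from this repository's configs):
# dataset = TextAudioLoaderMultiNSFsid("logs/exp/filelist.txt", hps.data)
# sampler = DistributedBucketSampler(dataset, batch_size=4,
#                                    boundaries=[100, 200, 300, 400, 500],
#                                    num_replicas=1, rank=0, shuffle=True)
# loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler,
#                                      collate_fn=TextAudioCollateMultiNSFsid(),
#                                      num_workers=4)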

@ -0,0 +1,58 @@
import torch
def feature_loss(fmap_r, fmap_g):
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
rl = rl.float().detach()
gl = gl.float()
loss += torch.mean(torch.abs(rl - gl))
return loss * 2
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
dr = dr.float()
dg = dg.float()
r_loss = torch.mean((1 - dr) ** 2)
g_loss = torch.mean(dg**2)
loss += r_loss + g_loss
r_losses.append(r_loss.item())
g_losses.append(g_loss.item())
return loss, r_losses, g_losses
def generator_loss(disc_outputs):
loss = 0
gen_losses = []
for dg in disc_outputs:
dg = dg.float()
l = torch.mean((1 - dg) ** 2)
gen_losses.append(l)
loss += l
return loss, gen_losses
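# These three terms form a least-squares GAN (LSGAN) objective: the
# discriminator pushes real outputs toward 1 and generated outputs toward 0,
# the generator pushes its outputs toward 1, and feature_loss adds an L1
# feature-matching term over the discriminator's intermediate maps (scaled by 2).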
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
"""
z_p, logs_q: [b, h, t_t]
m_p, logs_p: [b, h, t_t]
"""
z_p = z_p.float()
logs_q = logs_q.float()
m_p = m_p.float()
logs_p = logs_p.float()
z_mask = z_mask.float()
kl = logs_p - logs_q - 0.5
kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
kl = torch.sum(kl * z_mask)
l = kl / torch.sum(z_mask)
return l

@ -0,0 +1,127 @@
import torch
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
import logging
logger = logging.getLogger(__name__)
MAX_WAV_VALUE = 32768.0
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
"""
PARAMS
------
C: compression factor
"""
return torch.log(torch.clamp(x, min=clip_val) * C)
def dynamic_range_decompression_torch(x, C=1):
"""
PARAMS
------
C: compression factor used to compress
"""
return torch.exp(x) / C
def spectral_normalize_torch(magnitudes):
return dynamic_range_compression_torch(magnitudes)
def spectral_de_normalize_torch(magnitudes):
return dynamic_range_decompression_torch(magnitudes)
# Reusable banks
mel_basis = {}
hann_window = {}
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
"""Convert waveform into Linear-frequency Linear-amplitude spectrogram.
Args:
y :: (B, T) - Audio waveforms
n_fft
sampling_rate
hop_size
win_size
center
Returns:
:: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram
"""
# Window - Cache if needed
global hann_window
dtype_device = str(y.dtype) + "_" + str(y.device)
wnsize_dtype_device = str(win_size) + "_" + dtype_device
if wnsize_dtype_device not in hann_window:
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
dtype=y.dtype, device=y.device
)
# Padding
y = torch.nn.functional.pad(
y.unsqueeze(1),
(int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
mode="reflect",
)
y = y.squeeze(1)
# Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2)
spec = torch.stft(
y,
n_fft,
hop_length=hop_size,
win_length=win_size,
window=hann_window[wnsize_dtype_device],
center=center,
pad_mode="reflect",
normalized=False,
onesided=True,
return_complex=True,
)
# Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame)
spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
return spec
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
# MelBasis - Cache if needed
global mel_basis
dtype_device = str(spec.dtype) + "_" + str(spec.device)
fmax_dtype_device = str(fmax) + "_" + dtype_device
if fmax_dtype_device not in mel_basis:
mel = librosa_mel_fn(
sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
)
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
dtype=spec.dtype, device=spec.device
)
# Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame)
melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
melspec = spectral_normalize_torch(melspec)
return melspec
def mel_spectrogram_torch(
y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
"""Convert waveform into Mel-frequency Log-amplitude spectrogram.
Args:
y :: (B, T) - Waveforms
Returns:
melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
"""
# Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame)
spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
# Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame)
melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax)
return melspec
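# Minimal usage sketch (parameter values are illustrative, not from a config):
# wav = torch.randn(1, 16000)  # (B, T) waveform
# mel = mel_spectrogram_torch(wav, n_fft=1024, num_mels=80, sampling_rate=16000,
#                             hop_size=160, win_size=1024, fmin=0, fmax=8000)
# mel.shape -> torch.Size([1, 80, 100])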

@ -0,0 +1,261 @@
import os
import sys
import traceback
from collections import OrderedDict
import torch
from i18n.i18n import I18nAuto
i18n = I18nAuto()
def savee(ckpt, sr, if_f0, name, epoch, version, hps):
try:
opt = OrderedDict()
opt["weight"] = {}
for key in ckpt.keys():
if "enc_q" in key:
continue
opt["weight"][key] = ckpt[key].half()
opt["config"] = [
hps.data.filter_length // 2 + 1,
32,
hps.model.inter_channels,
hps.model.hidden_channels,
hps.model.filter_channels,
hps.model.n_heads,
hps.model.n_layers,
hps.model.kernel_size,
hps.model.p_dropout,
hps.model.resblock,
hps.model.resblock_kernel_sizes,
hps.model.resblock_dilation_sizes,
hps.model.upsample_rates,
hps.model.upsample_initial_channel,
hps.model.upsample_kernel_sizes,
hps.model.spk_embed_dim,
hps.model.gin_channels,
hps.data.sampling_rate,
]
opt["info"] = "%sepoch" % epoch
opt["sr"] = sr
opt["f0"] = if_f0
opt["version"] = version
torch.save(opt, "assets/weights/%s.pth" % name)
return "Success."
except:
return traceback.format_exc()
def show_info(path):
try:
a = torch.load(path, map_location="cpu")
return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % (
a.get("info", "None"),
a.get("sr", "None"),
a.get("f0", "None"),
a.get("version", "None"),
)
except:
return traceback.format_exc()
def extract_small_model(path, name, sr, if_f0, info, version):
try:
ckpt = torch.load(path, map_location="cpu")
if "model" in ckpt:
ckpt = ckpt["model"]
opt = OrderedDict()
opt["weight"] = {}
for key in ckpt.keys():
if "enc_q" in key:
continue
opt["weight"][key] = ckpt[key].half()
if sr == "40k":
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 10, 2, 2],
512,
[16, 16, 4, 4],
109,
256,
40000,
]
elif sr == "48k":
if version == "v1":
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 6, 2, 2, 2],
512,
[16, 16, 4, 4, 4],
109,
256,
48000,
]
else:
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[12, 10, 2, 2],
512,
[24, 20, 4, 4],
109,
256,
48000,
]
elif sr == "32k":
if version == "v1":
opt["config"] = [
513,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 4, 2, 2, 2],
512,
[16, 16, 4, 4, 4],
109,
256,
32000,
]
else:
opt["config"] = [
513,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 8, 2, 2],
512,
[20, 16, 4, 4],
109,
256,
32000,
]
if info == "":
info = "Extracted model."
opt["info"] = info
opt["version"] = version
opt["sr"] = sr
opt["f0"] = int(if_f0)
torch.save(opt, "assets/weights/%s.pth" % name)
return "Success."
except:
return traceback.format_exc()
def change_info(path, info, name):
try:
ckpt = torch.load(path, map_location="cpu")
ckpt["info"] = info
if name == "":
name = os.path.basename(path)
torch.save(ckpt, "assets/weights/%s" % name)
return "Success."
except:
return traceback.format_exc()
def merge(path1, path2, alpha1, sr, f0, info, name, version):
try:
def extract(ckpt):
a = ckpt["model"]
opt = OrderedDict()
opt["weight"] = {}
for key in a.keys():
if "enc_q" in key:
continue
opt["weight"][key] = a[key]
return opt
ckpt1 = torch.load(path1, map_location="cpu")
ckpt2 = torch.load(path2, map_location="cpu")
cfg = ckpt1["config"]
if "model" in ckpt1:
ckpt1 = extract(ckpt1)
else:
ckpt1 = ckpt1["weight"]
if "model" in ckpt2:
ckpt2 = extract(ckpt2)
else:
ckpt2 = ckpt2["weight"]
if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
return "Fail to merge the models. The model architectures are not the same."
opt = OrderedDict()
opt["weight"] = {}
for key in ckpt1.keys():
# try:
if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
opt["weight"][key] = (
alpha1 * (ckpt1[key][:min_shape0].float())
+ (1 - alpha1) * (ckpt2[key][:min_shape0].float())
).half()
else:
opt["weight"][key] = (
alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float())
).half()
# except:
# pdb.set_trace()
opt["config"] = cfg
"""
if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000]
elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000]
elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
"""
opt["sr"] = sr
opt["f0"] = 1 if f0 == i18n("") else 0
opt["version"] = version
opt["info"] = info
torch.save(opt, "assets/weights/%s.pth" % name)
return "Success."
except:
return traceback.format_exc()

@ -0,0 +1,478 @@
import argparse
import glob
import json
import logging
import os
import subprocess
import sys
import shutil
import numpy as np
import torch
from scipy.io.wavfile import read
MATPLOTLIB_FLAG = False
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging
def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
##################
def go(model, bkey):
saved_state_dict = checkpoint_dict[bkey]
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict = {}
        for k, v in state_dict.items():  # iterate over the shapes the model expects
try:
new_state_dict[k] = saved_state_dict[k]
if saved_state_dict[k].shape != state_dict[k].shape:
logger.warning(
"shape-%s-mismatch. need: %s, get: %s",
k,
state_dict[k].shape,
saved_state_dict[k].shape,
) #
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint", k) # pretrain缺失的
new_state_dict[k] = v # 模型自带的随机值
if hasattr(model, "module"):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
return model
go(combd, "combd")
model = go(sbd, "sbd")
#############
logger.info("Loaded model weights")
iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"]
if (
optimizer is not None and load_opt == 1
    ):  ### if the optimizer state cannot be loaded (e.g. it is empty), re-initializing it could also break the LR scheduler, so failures are caught at the outermost level of the train script
# try:
optimizer.load_state_dict(checkpoint_dict["optimizer"])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
# def load_checkpoint(checkpoint_path, model, optimizer=None):
# assert os.path.isfile(checkpoint_path)
# checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
# iteration = checkpoint_dict['iteration']
# learning_rate = checkpoint_dict['learning_rate']
# if optimizer is not None:
# optimizer.load_state_dict(checkpoint_dict['optimizer'])
# # print(1111)
# saved_state_dict = checkpoint_dict['model']
# # print(1111)
#
# if hasattr(model, 'module'):
# state_dict = model.module.state_dict()
# else:
# state_dict = model.state_dict()
# new_state_dict= {}
# for k, v in state_dict.items():
# try:
# new_state_dict[k] = saved_state_dict[k]
# except:
# logger.info("%s is not in the checkpoint" % k)
# new_state_dict[k] = v
# if hasattr(model, 'module'):
# model.module.load_state_dict(new_state_dict)
# else:
# model.load_state_dict(new_state_dict)
# logger.info("Loaded checkpoint '{}' (epoch {})" .format(
# checkpoint_path, iteration))
# return model, optimizer, learning_rate, iteration
def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
saved_state_dict = checkpoint_dict["model"]
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict = {}
    for k, v in state_dict.items():  # iterate over the shapes the model expects
try:
new_state_dict[k] = saved_state_dict[k]
if saved_state_dict[k].shape != state_dict[k].shape:
logger.warning(
"shape-%s-mismatch|need-%s|get-%s",
k,
state_dict[k].shape,
saved_state_dict[k].shape,
) #
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint", k) # pretrain缺失的
new_state_dict[k] = v # 模型自带的随机值
if hasattr(model, "module"):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
logger.info("Loaded model weights")
iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"]
if (
optimizer is not None and load_opt == 1
    ):  ### if the optimizer state cannot be loaded (e.g. it is empty), re-initializing it could also break the LR scheduler, so failures are caught at the outermost level of the train script
# try:
optimizer.load_state_dict(checkpoint_dict["optimizer"])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
logger.info(
"Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path
)
)
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
torch.save(
{
"model": state_dict,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
logger.info(
"Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path
)
)
if hasattr(combd, "module"):
state_dict_combd = combd.module.state_dict()
else:
state_dict_combd = combd.state_dict()
if hasattr(sbd, "module"):
state_dict_sbd = sbd.module.state_dict()
else:
state_dict_sbd = sbd.state_dict()
torch.save(
{
"combd": state_dict_combd,
"sbd": state_dict_sbd,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
def summarize(
writer,
global_step,
scalars={},
histograms={},
images={},
audios={},
audio_sampling_rate=22050,
):
for k, v in scalars.items():
writer.add_scalar(k, v, global_step)
for k, v in histograms.items():
writer.add_histogram(k, v, global_step)
for k, v in images.items():
writer.add_image(k, v, global_step, dataformats="HWC")
for k, v in audios.items():
writer.add_audio(k, v, global_step, audio_sampling_rate)
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
f_list = glob.glob(os.path.join(dir_path, regex))
f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
x = f_list[-1]
logger.debug(x)
return x
def plot_spectrogram_to_numpy(spectrogram):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger("matplotlib")
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
plt.colorbar(im, ax=ax)
plt.xlabel("Frames")
plt.ylabel("Channels")
plt.tight_layout()
fig.canvas.draw()
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def plot_alignment_to_numpy(alignment, info=None):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger("matplotlib")
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(
alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
)
fig.colorbar(im, ax=ax)
xlabel = "Decoder timestep"
if info is not None:
xlabel += "\n\n" + info
plt.xlabel(xlabel)
plt.ylabel("Encoder timestep")
plt.tight_layout()
fig.canvas.draw()
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path)
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
def load_filepaths_and_text(filename, split="|"):
with open(filename, encoding="utf-8") as f:
filepaths_and_text = [line.strip().split(split) for line in f]
return filepaths_and_text
def get_hparams(init=True):
"""
todo:
结尾七人组
保存频率总epoch done
bs done
pretrainGpretrainD done
卡号os.en["CUDA_VISIBLE_DEVICES"] done
if_latest done
模型if_f0 done
采样率自动选择config done
是否缓存数据集进GPU:if_cache_data_in_gpu done
-m:
自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done
-c不要了
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-se",
"--save_every_epoch",
type=int,
required=True,
help="checkpoint save frequency (epoch)",
)
parser.add_argument(
"-te", "--total_epoch", type=int, required=True, help="total_epoch"
)
parser.add_argument(
"-pg", "--pretrainG", type=str, default="", help="Pretrained Generator path"
)
parser.add_argument(
"-pd", "--pretrainD", type=str, default="", help="Pretrained Discriminator path"
)
parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -")
parser.add_argument(
"-bs", "--batch_size", type=int, required=True, help="batch size"
)
parser.add_argument(
"-e", "--experiment_dir", type=str, required=True, help="experiment dir"
) # -m
parser.add_argument(
"-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k"
)
parser.add_argument(
"-sw",
"--save_every_weights",
type=str,
default="0",
help="save the extracted model in weights directory when saving checkpoints",
)
parser.add_argument(
"-v", "--version", type=str, required=True, help="model version"
)
parser.add_argument(
"-f0",
"--if_f0",
type=int,
required=True,
help="use f0 as one of the inputs of the model, 1 or 0",
)
parser.add_argument(
"-l",
"--if_latest",
type=int,
required=True,
help="if only save the latest G/D pth file, 1 or 0",
)
parser.add_argument(
"-c",
"--if_cache_data_in_gpu",
type=int,
required=True,
help="if caching the dataset in GPU memory, 1 or 0",
)
args = parser.parse_args()
name = args.experiment_dir
experiment_dir = os.path.join("./logs", args.experiment_dir)
config_save_path = os.path.join(experiment_dir, "config.json")
with open(config_save_path, "r") as f:
config = json.load(f)
hparams = HParams(**config)
hparams.model_dir = hparams.experiment_dir = experiment_dir
hparams.save_every_epoch = args.save_every_epoch
hparams.name = name
hparams.total_epoch = args.total_epoch
hparams.pretrainG = args.pretrainG
hparams.pretrainD = args.pretrainD
hparams.version = args.version
hparams.gpus = args.gpus
hparams.train.batch_size = args.batch_size
hparams.sample_rate = args.sample_rate
hparams.if_f0 = args.if_f0
hparams.if_latest = args.if_latest
hparams.save_every_weights = args.save_every_weights
hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
hparams.data.training_files = "%s/filelist.txt" % experiment_dir
return hparams
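# Example invocation (the script name and argument values are illustrative; the
# required flags follow the parser above):
#   python train.py -e myexp -sr 40k -f0 1 -bs 8 -te 200 -se 10 -v v2 -l 0 -c 0 -g 0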
def get_hparams_from_dir(model_dir):
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
hparams.model_dir = model_dir
return hparams
def get_hparams_from_file(config_path):
with open(config_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
return hparams
def check_git_hash(model_dir):
source_dir = os.path.dirname(os.path.realpath(__file__))
if not os.path.exists(os.path.join(source_dir, ".git")):
logger.warning(
"{} is not a git repository, therefore hash value comparison will be ignored.".format(
source_dir
)
)
return
cur_hash = subprocess.getoutput("git rev-parse HEAD")
path = os.path.join(model_dir, "githash")
if os.path.exists(path):
saved_hash = open(path).read()
if saved_hash != cur_hash:
logger.warning(
"git hash values are different. {}(saved) != {}(current)".format(
saved_hash[:8], cur_hash[:8]
)
)
else:
open(path, "w").write(cur_hash)
def get_logger(model_dir, filename="train.log"):
global logger
logger = logging.getLogger(os.path.basename(model_dir))
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
h = logging.FileHandler(os.path.join(model_dir, filename))
h.setLevel(logging.DEBUG)
h.setFormatter(formatter)
logger.addHandler(h)
return logger
class HParams:
def __init__(self, **kwargs):
for k, v in kwargs.items():
if type(v) == dict:
v = HParams(**v)
self[k] = v
def keys(self):
return self.__dict__.keys()
def items(self):
return self.__dict__.items()
def values(self):
return self.__dict__.values()
def __len__(self):
return len(self.__dict__)
def __getitem__(self, key):
return getattr(self, key)
def __setitem__(self, key, value):
return setattr(self, key, value)
def __contains__(self, key):
return key in self.__dict__
def __repr__(self):
return self.__dict__.__repr__()
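# Minimal usage sketch (keys and values are illustrative):
# hps = HParams(train={"batch_size": 8}, data={"sampling_rate": 40000})
# hps.train.batch_size       -> 8
# "data" in hps              -> True
# hps["data"].sampling_rate  -> 40000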

@ -0,0 +1,183 @@
import os
import random
import numpy as np
import torch
import torch.utils.data
from tqdm import tqdm
from . import spec_utils
class VocalRemoverValidationSet(torch.utils.data.Dataset):
def __init__(self, patch_list):
self.patch_list = patch_list
def __len__(self):
return len(self.patch_list)
def __getitem__(self, idx):
path = self.patch_list[idx]
data = np.load(path)
X, y = data["X"], data["y"]
X_mag = np.abs(X)
y_mag = np.abs(y)
return X_mag, y_mag
def make_pair(mix_dir, inst_dir):
input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
X_list = sorted(
[
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts
]
)
y_list = sorted(
[
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts
]
)
filelist = list(zip(X_list, y_list))
return filelist
def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
if split_mode == "random":
filelist = make_pair(
os.path.join(dataset_dir, "mixtures"),
os.path.join(dataset_dir, "instruments"),
)
random.shuffle(filelist)
if len(val_filelist) == 0:
val_size = int(len(filelist) * val_rate)
train_filelist = filelist[:-val_size]
val_filelist = filelist[-val_size:]
else:
train_filelist = [
pair for pair in filelist if list(pair) not in val_filelist
]
elif split_mode == "subdirs":
if len(val_filelist) != 0:
raise ValueError(
"The `val_filelist` option is not available in `subdirs` mode"
)
train_filelist = make_pair(
os.path.join(dataset_dir, "training/mixtures"),
os.path.join(dataset_dir, "training/instruments"),
)
val_filelist = make_pair(
os.path.join(dataset_dir, "validation/mixtures"),
os.path.join(dataset_dir, "validation/instruments"),
)
return train_filelist, val_filelist
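# Expected dataset layout for split_mode="subdirs" (directory names come from
# the code above; file names are arbitrary):
#   dataset_dir/training/mixtures/*.wav      dataset_dir/training/instruments/*.wav
#   dataset_dir/validation/mixtures/*.wav    dataset_dir/validation/instruments/*.wav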
def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
perm = np.random.permutation(len(X))
for i, idx in enumerate(tqdm(perm)):
if np.random.uniform() < reduction_rate:
y[idx] = spec_utils.reduce_vocal_aggressively(
X[idx], y[idx], reduction_mask
)
if np.random.uniform() < 0.5:
# swap channel
X[idx] = X[idx, ::-1]
y[idx] = y[idx, ::-1]
if np.random.uniform() < 0.02:
# mono
X[idx] = X[idx].mean(axis=0, keepdims=True)
y[idx] = y[idx].mean(axis=0, keepdims=True)
if np.random.uniform() < 0.02:
# inst
X[idx] = y[idx]
if np.random.uniform() < mixup_rate and i < len(perm) - 1:
lam = np.random.beta(mixup_alpha, mixup_alpha)
X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
return X, y
def make_padding(width, cropsize, offset):
left = offset
roi_size = cropsize - left * 2
if roi_size == 0:
roi_size = cropsize
right = roi_size - (width % roi_size) + left
return left, right, roi_size
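# Worked example (numbers are illustrative): width=1000, cropsize=256, offset=32
# gives left=32, roi_size=256 - 2*32 = 192, right=192 - (1000 % 192) + 32 = 184,
# so the padded width 32 + 1000 + 184 = 1216 ends exactly where the last
# cropsize-wide patch (starting at 5 * roi_size = 960) finishes.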
def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
len_dataset = patches * len(filelist)
X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
coef = np.max([np.abs(X).max(), np.abs(y).max()])
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
ends = starts + cropsize
for j in range(patches):
idx = i * patches + j
X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
return X_dataset, y_dataset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = []
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
cropsize, sr, hop_length, n_fft, offset
)
os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
basename = os.path.splitext(os.path.basename(X_path))[0]
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
coef = np.max([np.abs(X).max(), np.abs(y).max()])
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
len_dataset = int(np.ceil(X.shape[2] / roi_size))
for j in range(len_dataset):
outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
start = j * roi_size
if not os.path.exists(outpath):
np.savez(
outpath,
X=X_pad[:, :, start : start + cropsize],
y=y_pad[:, :, start : start + cropsize],
)
patch_list.append(outpath)
return VocalRemoverValidationSet(patch_list)

@ -0,0 +1,118 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
bottle = self.bottleneck(out)
return bottle
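# Minimal shape sketch (sizes are illustrative): for x of shape (1, 64, 128, 256),
# ASPPModule(64, 128)(x) concatenates five 64-channel branches into 320 channels
# and the 1x1 bottleneck maps them to 128, preserving H and W:
# output shape -> torch.Size([1, 128, 128, 256]).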

@ -0,0 +1,118 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
bottle = self.bottleneck(out)
return bottle

@ -0,0 +1,118 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
bottle = self.bottleneck(out)
return bottle

@ -0,0 +1,126 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
feat6 = self.conv6(x)
feat7 = self.conv7(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
bottle = self.bottleneck(out)
return bottle

@ -0,0 +1,126 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
feat6 = self.conv6(x)
feat7 = self.conv7(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
bottle = self.bottleneck(out)
return bottle

@ -0,0 +1,126 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
feat6 = self.conv6(x)
feat7 = self.conv7(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
bottle = self.bottleneck(out)
return bottle

@ -0,0 +1,125 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
def __call__(self, x):
h = self.conv1(x)
h = self.conv2(h)
return h
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
# self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv1(x)
# h = self.conv2(h)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
self.conv3 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
out = self.bottleneck(out)
if self.dropout is not None:
out = self.dropout(out)
return out
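# Time-axis LSTM branch: a 1x1 conv collapses the channels to one, a bidirectional LSTM
# runs over frames with the frequency bins as features, and a dense layer maps back to nin_lstm bins.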
class LSTMModule(nn.Module):
def __init__(self, nin_conv, nin_lstm, nout_lstm):
super(LSTMModule, self).__init__()
self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
self.lstm = nn.LSTM(
input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
)
self.dense = nn.Sequential(
nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
)
def forward(self, x):
N, _, nbins, nframes = x.size()
h = self.conv(x)[:, 0] # N, nbins, nframes
h = h.permute(2, 0, 1) # nframes, N, nbins
h, _ = self.lstm(h)
h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins
h = h.reshape(nframes, N, 1, nbins)
h = h.permute(1, 2, 3, 0)
return h

@ -0,0 +1,69 @@
import json
import os
import pathlib
default_param = {}
default_param["bins"] = 768
default_param["unstable_bins"] = 9 # training only
default_param["reduction_bins"] = 762 # training only
default_param["sr"] = 44100
default_param["pre_filter_start"] = 757
default_param["pre_filter_stop"] = 768
default_param["band"] = {}
default_param["band"][1] = {
"sr": 11025,
"hl": 128,
"n_fft": 960,
"crop_start": 0,
"crop_stop": 245,
"lpf_start": 61, # inference only
"res_type": "polyphase",
}
default_param["band"][2] = {
"sr": 44100,
"hl": 512,
"n_fft": 1536,
"crop_start": 24,
"crop_stop": 547,
"hpf_start": 81, # inference only
"res_type": "sinc_best",
}
def int_keys(d):
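    # json object_pairs_hook: turn digit string keys (band numbers) back into ints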
r = {}
for k, v in d:
if k.isdigit():
k = int(k)
r[k] = v
return r
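# Band-split STFT parameters: loaded from param.json inside a .pth (zip) bundle,
# from a plain .json file, or from the defaults above.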
class ModelParameters(object):
def __init__(self, config_path=""):
if ".pth" == pathlib.Path(config_path).suffix:
import zipfile
with zipfile.ZipFile(config_path, "r") as zip:
self.param = json.loads(
zip.read("param.json"), object_pairs_hook=int_keys
)
elif ".json" == pathlib.Path(config_path).suffix:
with open(config_path, "r") as f:
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
else:
self.param = default_param
for k in [
"mid_side",
"mid_side_b",
"mid_side_b2",
"stereo_w",
"stereo_n",
"reverse",
]:
            if k not in self.param:

self.param[k] = False

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 16000,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 16000,
"pre_filter_start": 1023,
"pre_filter_stop": 1024
}

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 32000,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "kaiser_fast"
}
},
"sr": 32000,
"pre_filter_start": 1000,
"pre_filter_stop": 1021
}

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 33075,
"hl": 384,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 33075,
"pre_filter_start": 1000,
"pre_filter_stop": 1021
}

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 1024,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 1023,
"pre_filter_stop": 1024
}

@ -0,0 +1,19 @@
{
"bins": 256,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 256,
"n_fft": 512,
"crop_start": 0,
"crop_stop": 256,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 256,
"pre_filter_stop": 256
}

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 1023,
"pre_filter_stop": 1024
}

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 700,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 1023,
"pre_filter_stop": 700
}

@ -0,0 +1,30 @@
{
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 705,
"band": {
"1": {
"sr": 6000,
"hl": 66,
"n_fft": 512,
"crop_start": 0,
"crop_stop": 240,
"lpf_start": 60,
"lpf_stop": 118,
"res_type": "sinc_fastest"
},
"2": {
"sr": 32000,
"hl": 352,
"n_fft": 1024,
"crop_start": 22,
"crop_stop": 505,
"hpf_start": 44,
"hpf_stop": 23,
"res_type": "sinc_medium"
}
},
"sr": 32000,
"pre_filter_start": 710,
"pre_filter_stop": 731
}

@ -0,0 +1,30 @@
{
"bins": 512,
"unstable_bins": 7,
"reduction_bins": 510,
"band": {
"1": {
"sr": 11025,
"hl": 160,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 192,
"lpf_start": 41,
"lpf_stop": 139,
"res_type": "sinc_fastest"
},
"2": {
"sr": 44100,
"hl": 640,
"n_fft": 1024,
"crop_start": 10,
"crop_stop": 320,
"hpf_start": 47,
"hpf_stop": 15,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 510,
"pre_filter_stop": 512
}

@ -0,0 +1,30 @@
{
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 705,
"band": {
"1": {
"sr": 6000,
"hl": 66,
"n_fft": 512,
"crop_start": 0,
"crop_stop": 240,
"lpf_start": 60,
"lpf_stop": 240,
"res_type": "sinc_fastest"
},
"2": {
"sr": 48000,
"hl": 528,
"n_fft": 1536,
"crop_start": 22,
"crop_stop": 505,
"hpf_start": 82,
"hpf_stop": 22,
"res_type": "sinc_medium"
}
},
"sr": 48000,
"pre_filter_start": 710,
"pre_filter_stop": 731
}

@ -0,0 +1,42 @@
{
"bins": 768,
"unstable_bins": 5,
"reduction_bins": 733,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 278,
"lpf_start": 28,
"lpf_stop": 140,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 256,
"n_fft": 768,
"crop_start": 14,
"crop_stop": 322,
"hpf_start": 70,
"hpf_stop": 14,
"lpf_start": 283,
"lpf_stop": 314,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 131,
"crop_stop": 313,
"hpf_start": 154,
"hpf_stop": 141,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 757,
"pre_filter_stop": 768
}

@ -0,0 +1,43 @@
{
"mid_side": true,
"bins": 768,
"unstable_bins": 5,
"reduction_bins": 733,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 278,
"lpf_start": 28,
"lpf_stop": 140,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 256,
"n_fft": 768,
"crop_start": 14,
"crop_stop": 322,
"hpf_start": 70,
"hpf_stop": 14,
"lpf_start": 283,
"lpf_stop": 314,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 131,
"crop_stop": 313,
"hpf_start": 154,
"hpf_stop": 141,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 757,
"pre_filter_stop": 768
}

@ -0,0 +1,43 @@
{
"mid_side_b2": true,
"bins": 640,
"unstable_bins": 7,
"reduction_bins": 565,
"band": {
"1": {
"sr": 11025,
"hl": 108,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 187,
"lpf_start": 92,
"lpf_stop": 186,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 216,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 212,
"hpf_start": 68,
"hpf_stop": 34,
"lpf_start": 174,
"lpf_stop": 209,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 432,
"n_fft": 640,
"crop_start": 66,
"crop_stop": 307,
"hpf_start": 86,
"hpf_stop": 72,
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 639,
"pre_filter_stop": 640
}

@ -0,0 +1,54 @@
{
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

@ -0,0 +1,55 @@
{
"bins": 768,
"unstable_bins": 7,
"mid_side": true,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

@ -0,0 +1,55 @@
{
"mid_side_b": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

@ -0,0 +1,55 @@
{
"mid_side_b": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

@ -0,0 +1,55 @@
{
"reverse": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

@ -0,0 +1,55 @@
{
"stereo_w": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

@ -0,0 +1,54 @@
{
"bins": 672,
"unstable_bins": 8,
"reduction_bins": 637,
"band": {
"1": {
"sr": 7350,
"hl": 80,
"n_fft": 640,
"crop_start": 0,
"crop_stop": 85,
"lpf_start": 25,
"lpf_stop": 53,
"res_type": "polyphase"
},
"2": {
"sr": 7350,
"hl": 80,
"n_fft": 320,
"crop_start": 4,
"crop_stop": 87,
"hpf_start": 25,
"hpf_stop": 12,
"lpf_start": 31,
"lpf_stop": 62,
"res_type": "polyphase"
},
"3": {
"sr": 14700,
"hl": 160,
"n_fft": 512,
"crop_start": 17,
"crop_stop": 216,
"hpf_start": 48,
"hpf_stop": 24,
"lpf_start": 139,
"lpf_stop": 210,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 480,
"n_fft": 960,
"crop_start": 78,
"crop_stop": 383,
"hpf_start": 130,
"hpf_stop": 86,
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 668,
"pre_filter_stop": 672
}

@ -0,0 +1,55 @@
{
"bins": 672,
"unstable_bins": 8,
"reduction_bins": 637,
"band": {
"1": {
"sr": 7350,
"hl": 80,
"n_fft": 640,
"crop_start": 0,
"crop_stop": 85,
"lpf_start": 25,
"lpf_stop": 53,
"res_type": "polyphase"
},
"2": {
"sr": 7350,
"hl": 80,
"n_fft": 320,
"crop_start": 4,
"crop_stop": 87,
"hpf_start": 25,
"hpf_stop": 12,
"lpf_start": 31,
"lpf_stop": 62,
"res_type": "polyphase"
},
"3": {
"sr": 14700,
"hl": 160,
"n_fft": 512,
"crop_start": 17,
"crop_stop": 216,
"hpf_start": 48,
"hpf_stop": 24,
"lpf_start": 139,
"lpf_stop": 210,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 480,
"n_fft": 960,
"crop_start": 78,
"crop_stop": 383,
"hpf_start": 130,
"hpf_stop": 86,
"convert_channels": "stereo_n",
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 668,
"pre_filter_stop": 672
}

@ -0,0 +1,54 @@
{
"bins": 672,
"unstable_bins": 8,
"reduction_bins": 530,
"band": {
"1": {
"sr": 7350,
"hl": 80,
"n_fft": 640,
"crop_start": 0,
"crop_stop": 85,
"lpf_start": 25,
"lpf_stop": 53,
"res_type": "polyphase"
},
"2": {
"sr": 7350,
"hl": 80,
"n_fft": 320,
"crop_start": 4,
"crop_stop": 87,
"hpf_start": 25,
"hpf_stop": 12,
"lpf_start": 31,
"lpf_stop": 62,
"res_type": "polyphase"
},
"3": {
"sr": 14700,
"hl": 160,
"n_fft": 512,
"crop_start": 17,
"crop_stop": 216,
"hpf_start": 48,
"hpf_stop": 24,
"lpf_start": 139,
"lpf_stop": 210,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 480,
"n_fft": 960,
"crop_start": 78,
"crop_stop": 383,
"hpf_start": 130,
"hpf_stop": 86,
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 668,
"pre_filter_stop": 672
}

@ -0,0 +1,43 @@
{
"mid_side_b2": true,
"bins": 1280,
"unstable_bins": 7,
"reduction_bins": 565,
"band": {
"1": {
"sr": 11025,
"hl": 108,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 374,
"lpf_start": 92,
"lpf_stop": 186,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 216,
"n_fft": 1536,
"crop_start": 0,
"crop_stop": 424,
"hpf_start": 68,
"hpf_stop": 34,
"lpf_start": 348,
"lpf_stop": 418,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 432,
"n_fft": 1280,
"crop_start": 132,
"crop_stop": 614,
"hpf_start": 172,
"hpf_stop": 144,
"res_type": "polyphase"
}
},
"sr": 44100,
"pre_filter_start": 1280,
"pre_filter_stop": 1280
}

@ -0,0 +1,123 @@
import layers
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 16)
self.stg1_high_band_net = BaseASPPNet(2, 16)
self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(8, 16)
self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(16, 32)
self.out = nn.Conv2d(32, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
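    # Three-stage cascade: stage 1 handles the low and high halves of the spectrogram
    # separately, stages 2 and 3 refine the full band; the sigmoid output is a soft mask
    # applied to the input magnitude spectrogram.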
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
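                # Inference-time aggressiveness: raising the mask to a power > 1 suppresses
                # low-confidence bins, with a stronger exponent above split_bin.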
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

@ -0,0 +1,122 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
self.stg1_high_band_net = BaseASPPNet(2, 32)
self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(16, 32)
self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(32, 64)
self.out = nn.Conv2d(64, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

@ -0,0 +1,122 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
self.stg1_high_band_net = BaseASPPNet(2, 32)
self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(16, 32)
self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(32, 64)
self.out = nn.Conv2d(64, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

@ -0,0 +1,122 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import layers_33966KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 16)
self.stg1_high_band_net = BaseASPPNet(2, 16)
self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(8, 16)
self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(16, 32)
self.out = nn.Conv2d(32, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

@ -0,0 +1,123 @@
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from . import layers_537238KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 64)
self.stg1_high_band_net = BaseASPPNet(2, 64)
self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(32, 64)
self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(64, 128)
self.out = nn.Conv2d(128, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

@ -0,0 +1,123 @@
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from . import layers_537238KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 64)
self.stg1_high_band_net = BaseASPPNet(2, 64)
self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(32, 64)
self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(64, 128)
self.out = nn.Conv2d(128, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

@ -0,0 +1,122 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
self.stg1_high_band_net = BaseASPPNet(2, 32)
self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(16, 32)
self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(32, 64)
self.out = nn.Conv2d(64, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

@ -0,0 +1,133 @@
import torch
import torch.nn.functional as F
from torch import nn
from . import layers_new
class BaseNet(nn.Module):
def __init__(
self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
):
super(BaseNet, self).__init__()
self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)
self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1)
self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1)
self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1)
self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)
self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm)
self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)
def __call__(self, x):
e1 = self.enc1(x)
e2 = self.enc2(e1)
e3 = self.enc3(e2)
e4 = self.enc4(e3)
e5 = self.enc5(e4)
h = self.aspp(e5)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = torch.cat([h, self.lstm_dec2(h)], dim=1)
h = self.dec1(h, e1)
return h
class CascadedNet(nn.Module):
def __init__(self, n_fft, nout=32, nout_lstm=128):
super(CascadedNet, self).__init__()
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.nin_lstm = self.max_bin // 2
self.offset = 64
self.stg1_low_band_net = nn.Sequential(
BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm),
layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
)
self.stg1_high_band_net = BaseNet(
2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
)
self.stg2_low_band_net = nn.Sequential(
BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
)
self.stg2_high_band_net = BaseNet(
nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
)
self.stg3_full_band_net = BaseNet(
3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
)
self.out = nn.Conv2d(nout, 2, 1, bias=False)
self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)
def forward(self, x):
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
l1_in = x[:, :, :bandw]
h1_in = x[:, :, bandw:]
l1 = self.stg1_low_band_net(l1_in)
h1 = self.stg1_high_band_net(h1_in)
aux1 = torch.cat([l1, h1], dim=2)
l2_in = torch.cat([l1_in, l1], dim=1)
h2_in = torch.cat([h1_in, h1], dim=1)
l2 = self.stg2_low_band_net(l2_in)
h2 = self.stg2_high_band_net(h2_in)
aux2 = torch.cat([l2, h2], dim=2)
f3_in = torch.cat([x, aux1, aux2], dim=1)
f3 = self.stg3_full_band_net(f3_in)
mask = torch.sigmoid(self.out(f3))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode="replicate",
)
if self.training:
aux = torch.cat([aux1, aux2], dim=1)
aux = torch.sigmoid(self.aux_out(aux))
aux = F.pad(
input=aux,
pad=(0, 0, 0, self.output_bin - aux.size()[2]),
mode="replicate",
)
return mask, aux
else:
return mask
def predict_mask(self, x):
mask = self.forward(x)
if self.offset > 0:
mask = mask[:, :, :, self.offset : -self.offset]
assert mask.size()[3] > 0
return mask
def predict(self, x, aggressiveness=None):
mask = self.forward(x)
pred_mag = x * mask
if self.offset > 0:
pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
assert pred_mag.size()[3] > 0
return pred_mag

@ -0,0 +1,672 @@
import hashlib
import json
import math
import os
import librosa
import numpy as np
import soundfile as sf
from tqdm import tqdm
def crop_center(h1, h2):
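    # Center-crop h1 along the time axis to match h2; used to align encoder skip connections.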
h1_shape = h1.size()
h2_shape = h2.size()
if h1_shape[3] == h2_shape[3]:
return h1
elif h1_shape[3] < h2_shape[3]:
raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
# s_freq = (h2_shape[2] - h1_shape[2]) // 2
# e_freq = s_freq + h1_shape[2]
s_time = (h1_shape[3] - h2_shape[3]) // 2
e_time = s_time + h2_shape[3]
h1 = h1[:, :, :, s_time:e_time]
return h1
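# Stereo STFT helper: optionally converts L/R into mid/side (or time-reverses each channel)
# before taking per-channel STFTs and stacking them as (2, freq, frames).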
def wave_to_spectrogram(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))
elif mid_side:
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length)
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
spec = np.asfortranarray([spec_left, spec_right])
return spec
def wave_to_spectrogram_mt(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
import threading
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))
elif mid_side:
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
def run_thread(**kwargs):
global spec_left
spec_left = librosa.stft(**kwargs)
thread = threading.Thread(
target=run_thread,
kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
)
thread.start()
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
thread.join()
spec = np.asfortranarray([spec_left, spec_right])
return spec
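# Stack the per-band spectrograms into a single (2, bins + 1, frames) array, copying each
# band's crop_start:crop_stop rows and applying a roll-off above pre_filter_start.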
def combine_spectrograms(specs, mp):
l = min([specs[i].shape[2] for i in specs])
spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
offset = 0
bands_n = len(mp.param["band"])
for d in range(1, bands_n + 1):
h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
spec_c[:, offset : offset + h, :l] = specs[d][
:, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
]
offset += h
if offset > mp.param["bins"]:
raise ValueError("Too much bins")
# lowpass fiter
if (
mp.param["pre_filter_start"] > 0
): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if bands_n == 1:
spec_c = fft_lp_filter(
spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
)
else:
gp = 1
for b in range(
mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
):
g = math.pow(
10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
)
gp = g
spec_c[:, b, :] *= g
return np.asfortranarray(spec_c)
def spectrogram_to_image(spec, mode="magnitude"):
if mode == "magnitude":
if np.iscomplexobj(spec):
y = np.abs(spec)
else:
y = spec
y = np.log10(y**2 + 1e-8)
elif mode == "phase":
if np.iscomplexobj(spec):
y = np.angle(spec)
else:
y = spec
y -= y.min()
y *= 255 / y.max()
img = np.uint8(y)
if y.ndim == 3:
img = img.transpose(1, 2, 0)
img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)
return img
def reduce_vocal_aggressively(X, y, softmask):
v = X - y
y_mag_tmp = np.abs(y)
v_mag_tmp = np.abs(v)
v_mask = v_mag_tmp > y_mag_tmp
y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
return y_mag * np.exp(1.0j * np.angle(y))
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
if min_range < fade_size * 2:
raise ValueError("min_range must be >= fade_area * 2")
mag = mag.copy()
idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
uninformative = np.where(ends - starts > min_range)[0]
if len(uninformative) > 0:
starts = starts[uninformative]
ends = ends[uninformative]
old_e = None
for s, e in zip(starts, ends):
if old_e is not None and s - old_e < fade_size:
s = old_e - fade_size * 2
if s != 0:
weight = np.linspace(0, 1, fade_size)
mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
else:
s -= fade_size
if e != mag.shape[2]:
weight = np.linspace(1, 0, fade_size)
mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
else:
e += fade_size
mag[:, :, s + fade_size : e - fade_size] += ref[
:, :, s + fade_size : e - fade_size
]
old_e = e
return mag
def align_wave_head_and_tail(a, b):
l = min([a[0].size, b[0].size])
return a[:l, :l], b[:l, :l]
def cache_or_load(mix_path, inst_path, mp):
mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
cache_dir = "mph{}".format(
hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
)
mix_cache_dir = os.path.join("cache", cache_dir)
inst_cache_dir = os.path.join("cache", cache_dir)
os.makedirs(mix_cache_dir, exist_ok=True)
os.makedirs(inst_cache_dir, exist_ok=True)
mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")
if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
X_spec_m = np.load(mix_cache_path)
y_spec_m = np.load(inst_cache_path)
else:
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
for d in range(len(mp.param["band"]), 0, -1):
bp = mp.param["band"][d]
if d == len(mp.param["band"]): # high-end band
X_wave[d], _ = librosa.load(
mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"]
)
y_wave[d], _ = librosa.load(
inst_path,
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
y_wave[d] = librosa.resample(
y_wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
X_spec_s[d] = wave_to_spectrogram(
X_wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
y_spec_s[d] = wave_to_spectrogram(
y_wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
del X_wave, y_wave
X_spec_m = combine_spectrograms(X_spec_s, mp)
y_spec_m = combine_spectrograms(y_spec_s, mp)
if X_spec_m.shape != y_spec_m.shape:
raise ValueError("The combined spectrograms are different: " + mix_path)
_, ext = os.path.splitext(mix_path)
np.save(mix_cache_path, X_spec_m)
np.save(inst_cache_path, y_spec_m)
return X_spec_m, y_spec_m
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
wave_left = librosa.istft(spec_left, hop_length=hop_length)
wave_right = librosa.istft(spec_right, hop_length=hop_length)
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
elif mid_side_b2:
return np.asfortranarray(
[
np.add(wave_right / 1.25, 0.4 * wave_left),
np.subtract(wave_left / 1.25, 0.4 * wave_right),
]
)
else:
return np.asfortranarray([wave_left, wave_right])
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
import threading
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
def run_thread(**kwargs):
global wave_left
wave_left = librosa.istft(**kwargs)
thread = threading.Thread(
target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
)
thread.start()
wave_right = librosa.istft(spec_right, hop_length=hop_length)
thread.join()
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
elif mid_side_b2:
return np.asfortranarray(
[
np.add(wave_right / 1.25, 0.4 * wave_left),
np.subtract(wave_left / 1.25, 0.4 * wave_right),
]
)
else:
return np.asfortranarray([wave_left, wave_right])
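# Inverse of combine_spectrograms: split the stacked bins back into per-band spectrograms,
# apply each band's high/low-pass edges, then invert, resample and accumulate up to the output sr.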
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
wave_band = {}
bands_n = len(mp.param["band"])
offset = 0
for d in range(1, bands_n + 1):
bp = mp.param["band"][d]
spec_s = np.ndarray(
shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
)
h = bp["crop_stop"] - bp["crop_start"]
spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
:, offset : offset + h, :
]
offset += h
if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass
max_bin = bp["n_fft"] // 2
spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
:, :extra_bins_h, :
]
if bp["hpf_start"] > 0:
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
if bands_n == 1:
wave = spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
else:
wave = np.add(
wave,
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
)
else:
sr = mp.param["band"][d + 1]["sr"]
if d == 1: # lower
spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
wave = librosa.resample(
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
bp["sr"],
sr,
res_type="sinc_fastest",
)
else: # mid
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
wave2 = np.add(
wave,
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
)
# wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy")
return wave.T
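# Low-pass: linearly fade the magnitude from 1 to 0 between bin_start and bin_stop,
# then zero everything from bin_stop upward.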
def fft_lp_filter(spec, bin_start, bin_stop):
g = 1.0
for b in range(bin_start, bin_stop):
g -= 1 / (bin_stop - bin_start)
spec[:, b, :] = g * spec[:, b, :]
spec[:, bin_stop:, :] *= 0
return spec
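# High-pass: mirror of the low-pass fade, attenuating downward from bin_start and
# zeroing all bins at or below bin_stop.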
def fft_hp_filter(spec, bin_start, bin_stop):
g = 1.0
for b in range(bin_start, bin_stop, -1):
g -= 1 / (bin_start - bin_stop)
spec[:, b, :] = g * spec[:, b, :]
spec[:, 0 : bin_stop + 1, :] *= 0
return spec
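# Reconstructs the missing high-frequency band by mirroring the bins just below
# pre_filter_start ("mirroring"), or by scaling that mirror against the original high end
# ("mirroring2"); in both cases the smaller magnitude wins per bin.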
def mirroring(a, spec_m, input_high_end, mp):
if "mirroring" == a:
mirror = np.flip(
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
:,
]
),
1,
)
mirror = mirror * np.exp(1.0j * np.angle(input_high_end))
return np.where(
np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
)
if "mirroring2" == a:
mirror = np.flip(
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
:,
]
),
1,
)
mi = np.multiply(mirror, input_high_end * 1.7)
return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
def ensembling(a, specs):
for i in range(1, len(specs)):
if i == 1:
spec = specs[0]
ln = min([spec.shape[2], specs[i].shape[2]])
spec = spec[:, :, :ln]
specs[i] = specs[i][:, :, :ln]
if "min_mag" == a:
spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
if "max_mag" == a:
spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
return spec
def stft(wave, nfft, hl):
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
spec_left = librosa.stft(wave_left, nfft, hop_length=hl)
spec_right = librosa.stft(wave_right, nfft, hop_length=hl)
spec = np.asfortranarray([spec_left, spec_right])
return spec
def istft(spec, hl):
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
wave_left = librosa.istft(spec_left, hop_length=hl)
wave_right = librosa.istft(spec_right, hop_length=hl)
    wave = np.asfortranarray([wave_left, wave_right])
    return wave
if __name__ == "__main__":
import argparse
import sys
import time
import cv2
from model_param_init import ModelParameters
p = argparse.ArgumentParser()
p.add_argument(
"--algorithm",
"-a",
type=str,
choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
default="min_mag",
)
p.add_argument(
"--model_params",
"-m",
type=str,
default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
)
p.add_argument("--output_name", "-o", type=str, default="output")
p.add_argument("--vocals_only", "-v", action="store_true")
p.add_argument("input", nargs="+")
args = p.parse_args()
start_time = time.time()
if args.algorithm.startswith("invert") and len(args.input) != 2:
raise ValueError("There should be two input files.")
if not args.algorithm.startswith("invert") and len(args.input) < 2:
raise ValueError("There must be at least two input files.")
wave, specs = {}, {}
mp = ModelParameters(args.model_params)
for i in range(len(args.input)):
spec = {}
for d in range(len(mp.param["band"]), 0, -1):
bp = mp.param["band"][d]
if d == len(mp.param["band"]): # high-end band
wave[d], _ = librosa.load(
args.input[i],
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
if len(wave[d].shape) == 1: # mono to stereo
wave[d] = np.array([wave[d], wave[d]])
else: # lower bands
wave[d] = librosa.resample(
wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
spec[d] = wave_to_spectrogram(
wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
specs[i] = combine_spectrograms(spec, mp)
del wave
if args.algorithm == "deep":
        d_spec = np.where(np.abs(specs[0]) <= np.abs(specs[1]), specs[0], specs[1])
v_spec = d_spec - specs[1]
sf.write(
os.path.join("{}.wav".format(args.output_name)),
cmb_spectrogram_to_wave(v_spec, mp),
mp.param["sr"],
)
if args.algorithm.startswith("invert"):
ln = min([specs[0].shape[2], specs[1].shape[2]])
specs[0] = specs[0][:, :, :ln]
specs[1] = specs[1][:, :, :ln]
if "invert_p" == args.algorithm:
X_mag = np.abs(specs[0])
y_mag = np.abs(specs[1])
max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
else:
specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
v_spec = specs[0] - specs[1]
if not args.vocals_only:
X_mag = np.abs(specs[0])
y_mag = np.abs(specs[1])
v_mag = np.abs(v_spec)
X_image = spectrogram_to_image(X_mag)
y_image = spectrogram_to_image(y_mag)
v_image = spectrogram_to_image(v_mag)
cv2.imwrite("{}_X.png".format(args.output_name), X_image)
cv2.imwrite("{}_y.png".format(args.output_name), y_image)
cv2.imwrite("{}_v.png".format(args.output_name), v_image)
sf.write(
"{}_X.wav".format(args.output_name),
cmb_spectrogram_to_wave(specs[0], mp),
mp.param["sr"],
)
sf.write(
"{}_y.wav".format(args.output_name),
cmb_spectrogram_to_wave(specs[1], mp),
mp.param["sr"],
)
sf.write(
"{}_v.wav".format(args.output_name),
cmb_spectrogram_to_wave(v_spec, mp),
mp.param["sr"],
)
else:
if not args.algorithm == "deep":
sf.write(
os.path.join("ensembled", "{}.wav".format(args.output_name)),
cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
mp.param["sr"],
)
if args.algorithm == "align":
trackalignment = [
{
"file1": '"{}"'.format(args.input[0]),
"file2": '"{}"'.format(args.input[1]),
}
]
for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")
# print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))

@ -0,0 +1,263 @@
{
"equivalent" : [
{
"model_hash_name" : [
{
"hash_name": "47939caf0cfe52a0e81442b85b971dfd",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "a82f14e75892e55e994376edbf0c8435",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "08611fb99bd59eaa79ad27c58d137727",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "5c7bbca45a187e81abbbd351606164e5",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
},
{
"hash_name": "d6b2cb685a058a091e5e7098192d3233",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
},
{
"hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "c3448ec923fa0edf3d03a19e633faa53",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "68aa2c8093d0080704b200d140f59e54",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json",
"param_name": "3band_44100"
},
{
"hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid.json"
},
{
"hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid.json"
},
{
"hash_name": "52fdca89576f06cf4340b74a4730ee5f",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100.json"
},
{
"hash_name": "41191165b05d38fc77f072fa9e8e8a30",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100.json"
},
{
"hash_name": "89e83b511ad474592689e562d5b1f80e",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000.json"
},
{
"hash_name": "0b954da81d453b716b114d6d7c95177f",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000.json"
}
],
"v4 Models": [
{
"hash_name": "6a00461c51c2920fd68937d4609ed6c8",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "0ab504864d20f1bd378fe9c81ef37140",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "80ab74d65e515caa3622728d2de07d23",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "edc115e7fc523245062200c00caa847f",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "b58090534c52cbc3e9b5104bad666ef2",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "ae702fed0238afb5346db8356fe25f13",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
"param_name": "1band_sr44100_hl1024"
}
]
}
],
"User Models" : [
{
"1 Band": [
{
"hash_name": "1band_sr16000_hl512",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "1band_sr32000_hl512",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "1band_sr33075_hl384",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "1band_sr44100_hl256",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json",
"param_name": "1band_sr44100_hl256"
},
{
"hash_name": "1band_sr44100_hl512",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "1band_sr44100_hl1024",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
"param_name": "1band_sr44100_hl1024"
}
],
"2 Band": [
{
"hash_name": "2band_44100_lofi",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json",
"param_name": "2band_44100_lofi"
},
{
"hash_name": "2band_32000",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000"
},
{
"hash_name": "2band_48000",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json",
"param_name": "2band_48000"
}
],
"3 Band": [
{
"hash_name": "3band_44100",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json",
"param_name": "3band_44100"
},
{
"hash_name": "3band_44100_mid",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid"
},
{
"hash_name": "3band_44100_msb2",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
}
],
"4 Band": [
{
"hash_name": "4band_44100",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "4band_44100_mid",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json",
"param_name": "4band_44100_mid"
},
{
"hash_name": "4band_44100_msb",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json",
"param_name": "4band_44100_msb"
},
{
"hash_name": "4band_44100_msb2",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json",
"param_name": "4band_44100_msb2"
},
{
"hash_name": "4band_44100_reverse",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json",
"param_name": "4band_44100_reverse"
},
{
"hash_name": "4band_44100_sw",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json",
"param_name": "4band_44100_sw"
},
{
"hash_name": "4band_v2",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "4band_v2_sn",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "tmodelparam",
"model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json",
"param_name": "User Model Param Set"
}
]
}
]
}

@ -0,0 +1,121 @@
import json
import numpy as np
import torch
from tqdm import tqdm
def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict:
with open(file_name, "r") as f:
data = json.load(f)
return data
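# Compute left/right padding so the spectrogram width becomes a multiple of the sliding-window ROI size.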
def make_padding(width, cropsize, offset):
left = offset
roi_size = cropsize - left * 2
if roi_size == 0:
roi_size = cropsize
right = roi_size - (width % roi_size) + left
return left, right, roi_size
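# Sliding-window inference over a magnitude spectrogram, optionally averaged with a half-window-shifted TTA pass.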
def inference(X_spec, device, model, aggressiveness, data):
"""
    data: dict of inference configuration values (e.g. window_size, tta)
"""
def _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
):
model.eval()
with torch.no_grad():
preds = []
iterations = [n_window]
total_iterations = sum(iterations)
for i in tqdm(range(n_window)):
start = i * roi_size
X_mag_window = X_mag_pad[
None, :, :, start : start + data["window_size"]
]
X_mag_window = torch.from_numpy(X_mag_window)
if is_half:
X_mag_window = X_mag_window.half()
X_mag_window = X_mag_window.to(device)
pred = model.predict(X_mag_window, aggressiveness)
pred = pred.detach().cpu().numpy()
preds.append(pred[0])
pred = np.concatenate(preds, axis=2)
return pred
def preprocess(X_spec):
X_mag = np.abs(X_spec)
X_phase = np.angle(X_spec)
return X_mag, X_phase
X_mag, X_phase = preprocess(X_spec)
coef = X_mag.max()
X_mag_pre = X_mag / coef
n_frame = X_mag_pre.shape[2]
pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
n_window = int(np.ceil(n_frame / roi_size))
X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
if list(model.state_dict().values())[0].dtype == torch.float16:
is_half = True
else:
is_half = False
pred = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred = pred[:, :, :n_frame]
if data["tta"]:
pad_l += roi_size // 2
pad_r += roi_size // 2
n_window += 1
X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
pred_tta = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred_tta = pred_tta[:, :, roi_size // 2 :]
pred_tta = pred_tta[:, :, :n_frame]
return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
else:
return pred * coef, X_mag, np.exp(1.0j * X_phase)
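# Look up the parameter-set name and model-parameter JSON for a UVR5 model, matched by file hash or by a hash substring in the model path.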
def _get_name_params(model_path, model_hash):
data = load_data()
flag = False
ModelName = model_path
for type in list(data):
for model in list(data[type][0]):
for i in range(len(data[type][0][model])):
if str(data[type][0][model][i]["hash_name"]) == model_hash:
flag = True
elif str(data[type][0][model][i]["hash_name"]) in ModelName:
flag = True
if flag:
model_params_auto = data[type][0][model][i]["model_params"]
param_name_auto = data[type][0][model][i]["param_name"]
if type == "equivalent":
return param_name_auto, model_params_auto
else:
flag = False
return param_name_auto, model_params_auto

@ -0,0 +1,190 @@
import os
import sys
import contextlib
import torch
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
from .hijacks import ipex_hijacks
from .attention import attention_init
# pylint: disable=protected-access, missing-function-docstring, line-too-long
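# Alias the torch.cuda API onto torch.xpu so CUDA-targeted code runs unchanged on Intel XPU devices.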
def ipex_init(): # pylint: disable=too-many-statements
try:
# Replace cuda with xpu:
torch.cuda.current_device = torch.xpu.current_device
torch.cuda.current_stream = torch.xpu.current_stream
torch.cuda.device = torch.xpu.device
torch.cuda.device_count = torch.xpu.device_count
torch.cuda.device_of = torch.xpu.device_of
torch.cuda.get_device_name = torch.xpu.get_device_name
torch.cuda.get_device_properties = torch.xpu.get_device_properties
torch.cuda.init = torch.xpu.init
torch.cuda.is_available = torch.xpu.is_available
torch.cuda.is_initialized = torch.xpu.is_initialized
torch.cuda.is_current_stream_capturing = lambda: False
torch.cuda.set_device = torch.xpu.set_device
torch.cuda.stream = torch.xpu.stream
torch.cuda.synchronize = torch.xpu.synchronize
torch.cuda.Event = torch.xpu.Event
torch.cuda.Stream = torch.xpu.Stream
torch.cuda.FloatTensor = torch.xpu.FloatTensor
torch.Tensor.cuda = torch.Tensor.xpu
torch.Tensor.is_cuda = torch.Tensor.is_xpu
torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock
torch.cuda._initialized = torch.xpu.lazy_init._initialized
torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker
torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls
torch.cuda._tls = torch.xpu.lazy_init._tls
torch.cuda.threading = torch.xpu.lazy_init.threading
torch.cuda.traceback = torch.xpu.lazy_init.traceback
torch.cuda.Optional = torch.xpu.Optional
torch.cuda.__cached__ = torch.xpu.__cached__
torch.cuda.__loader__ = torch.xpu.__loader__
torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage
torch.cuda.Tuple = torch.xpu.Tuple
torch.cuda.streams = torch.xpu.streams
torch.cuda._lazy_new = torch.xpu._lazy_new
torch.cuda.FloatStorage = torch.xpu.FloatStorage
torch.cuda.Any = torch.xpu.Any
torch.cuda.__doc__ = torch.xpu.__doc__
torch.cuda.default_generators = torch.xpu.default_generators
torch.cuda.HalfTensor = torch.xpu.HalfTensor
torch.cuda._get_device_index = torch.xpu._get_device_index
torch.cuda.__path__ = torch.xpu.__path__
torch.cuda.Device = torch.xpu.Device
torch.cuda.IntTensor = torch.xpu.IntTensor
torch.cuda.ByteStorage = torch.xpu.ByteStorage
torch.cuda.set_stream = torch.xpu.set_stream
torch.cuda.BoolStorage = torch.xpu.BoolStorage
torch.cuda.os = torch.xpu.os
torch.cuda.torch = torch.xpu.torch
torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage
torch.cuda.Union = torch.xpu.Union
torch.cuda.DoubleTensor = torch.xpu.DoubleTensor
torch.cuda.ShortTensor = torch.xpu.ShortTensor
torch.cuda.LongTensor = torch.xpu.LongTensor
torch.cuda.IntStorage = torch.xpu.IntStorage
torch.cuda.LongStorage = torch.xpu.LongStorage
torch.cuda.__annotations__ = torch.xpu.__annotations__
torch.cuda.__package__ = torch.xpu.__package__
torch.cuda.__builtins__ = torch.xpu.__builtins__
torch.cuda.CharTensor = torch.xpu.CharTensor
torch.cuda.List = torch.xpu.List
torch.cuda._lazy_init = torch.xpu._lazy_init
torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor
torch.cuda.DoubleStorage = torch.xpu.DoubleStorage
torch.cuda.ByteTensor = torch.xpu.ByteTensor
torch.cuda.StreamContext = torch.xpu.StreamContext
torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage
torch.cuda.ShortStorage = torch.xpu.ShortStorage
torch.cuda._lazy_call = torch.xpu._lazy_call
torch.cuda.HalfStorage = torch.xpu.HalfStorage
torch.cuda.random = torch.xpu.random
torch.cuda._device = torch.xpu._device
torch.cuda.classproperty = torch.xpu.classproperty
torch.cuda.__name__ = torch.xpu.__name__
torch.cuda._device_t = torch.xpu._device_t
torch.cuda.warnings = torch.xpu.warnings
torch.cuda.__spec__ = torch.xpu.__spec__
torch.cuda.BoolTensor = torch.xpu.BoolTensor
torch.cuda.CharStorage = torch.xpu.CharStorage
torch.cuda.__file__ = torch.xpu.__file__
torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork
# torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing
# Memory:
torch.cuda.memory = torch.xpu.memory
if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read():
torch.xpu.empty_cache = lambda: None
torch.cuda.empty_cache = torch.xpu.empty_cache
torch.cuda.memory_stats = torch.xpu.memory_stats
torch.cuda.memory_summary = torch.xpu.memory_summary
torch.cuda.memory_snapshot = torch.xpu.memory_snapshot
torch.cuda.memory_allocated = torch.xpu.memory_allocated
torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated
torch.cuda.memory_reserved = torch.xpu.memory_reserved
torch.cuda.memory_cached = torch.xpu.memory_reserved
torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved
torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved
torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats
torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats
torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats
torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict
torch.cuda.reset_accumulated_memory_stats = (
torch.xpu.reset_accumulated_memory_stats
)
# RNG:
torch.cuda.get_rng_state = torch.xpu.get_rng_state
torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all
torch.cuda.set_rng_state = torch.xpu.set_rng_state
torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all
torch.cuda.manual_seed = torch.xpu.manual_seed
torch.cuda.manual_seed_all = torch.xpu.manual_seed_all
torch.cuda.seed = torch.xpu.seed
torch.cuda.seed_all = torch.xpu.seed_all
torch.cuda.initial_seed = torch.xpu.initial_seed
# AMP:
torch.cuda.amp = torch.xpu.amp
if not hasattr(torch.cuda.amp, "common"):
torch.cuda.amp.common = contextlib.nullcontext()
torch.cuda.amp.common.amp_definitely_not_available = lambda: False
try:
torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler
except Exception: # pylint: disable=broad-exception-caught
try:
from .gradscaler import (
gradscaler_init,
) # pylint: disable=import-outside-toplevel, import-error
gradscaler_init()
torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler
except Exception: # pylint: disable=broad-exception-caught
torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler
# C
torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream
ipex._C._DeviceProperties.major = 2023
ipex._C._DeviceProperties.minor = 2
# Fix functions with ipex:
torch.cuda.mem_get_info = lambda device=None: [
(
torch.xpu.get_device_properties(device).total_memory
- torch.xpu.memory_allocated(device)
),
torch.xpu.get_device_properties(device).total_memory,
]
torch._utils._get_available_device_type = lambda: "xpu"
torch.has_cuda = True
torch.cuda.has_half = True
torch.cuda.is_bf16_supported = lambda *args, **kwargs: True
torch.cuda.is_fp16_supported = lambda *args, **kwargs: True
torch.version.cuda = "11.7"
torch.cuda.get_device_capability = lambda *args, **kwargs: [11, 7]
torch.cuda.get_device_properties.major = 11
torch.cuda.get_device_properties.minor = 7
torch.cuda.ipc_collect = lambda *args, **kwargs: None
torch.cuda.utilization = lambda *args, **kwargs: 0
if hasattr(torch.xpu, "getDeviceIdListForCard"):
torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard
else:
torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card
torch.cuda.get_device_id_list_per_card = (
torch.xpu.get_device_id_list_per_card
)
ipex_hijacks()
attention_init()
try:
from .diffusers import ipex_diffusers
ipex_diffusers()
except Exception: # pylint: disable=broad-exception-caught
pass
except Exception as e:
return False, e
return True, None

@ -0,0 +1,212 @@
import torch
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
# pylint: disable=protected-access, missing-function-docstring, line-too-long
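# torch.bmm replacement that splits large batched matmuls into slices to stay under the ARC single-allocation limit.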
original_torch_bmm = torch.bmm
def torch_bmm(input, mat2, *, out=None):
if input.dtype != mat2.dtype:
mat2 = mat2.to(input.dtype)
    # ARC GPUs can't allocate more than 4GB to a single block, so slice it:
batch_size_attention, input_tokens, mat2_shape = (
input.shape[0],
input.shape[1],
mat2.shape[2],
)
block_multiply = input.element_size()
slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply
block_size = batch_size_attention * slice_block_size
split_slice_size = batch_size_attention
if block_size > 4:
do_split = True
# Find something divisible with the input_tokens
while (split_slice_size * slice_block_size) > 4:
split_slice_size = split_slice_size // 2
if split_slice_size <= 1:
split_slice_size = 1
break
else:
do_split = False
split_2_slice_size = input_tokens
if split_slice_size * slice_block_size > 4:
slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply
do_split_2 = True
# Find something divisible with the input_tokens
while (split_2_slice_size * slice_block_size2) > 4:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
break
else:
do_split_2 = False
if do_split:
hidden_states = torch.zeros(
input.shape[0],
input.shape[1],
mat2.shape[2],
device=input.device,
dtype=input.dtype,
)
for i in range(batch_size_attention // split_slice_size):
start_idx = i * split_slice_size
end_idx = (i + 1) * split_slice_size
if do_split_2:
for i2 in range(
input_tokens // split_2_slice_size
): # pylint: disable=invalid-name
start_idx_2 = i2 * split_2_slice_size
end_idx_2 = (i2 + 1) * split_2_slice_size
hidden_states[
start_idx:end_idx, start_idx_2:end_idx_2
] = original_torch_bmm(
input[start_idx:end_idx, start_idx_2:end_idx_2],
mat2[start_idx:end_idx, start_idx_2:end_idx_2],
out=out,
)
else:
hidden_states[start_idx:end_idx] = original_torch_bmm(
input[start_idx:end_idx], mat2[start_idx:end_idx], out=out
)
else:
return original_torch_bmm(input, mat2, out=out)
return hidden_states
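# scaled_dot_product_attention replacement that applies the same slicing strategy as torch_bmm above.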
original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
def scaled_dot_product_attention(
query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
):
    # ARC GPUs can't allocate more than 4GB to a single block, so slice it:
if len(query.shape) == 3:
batch_size_attention, query_tokens, shape_four = query.shape
shape_one = 1
no_shape_one = True
else:
shape_one, batch_size_attention, query_tokens, shape_four = query.shape
no_shape_one = False
block_multiply = query.element_size()
slice_block_size = (
shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply
)
block_size = batch_size_attention * slice_block_size
split_slice_size = batch_size_attention
if block_size > 4:
do_split = True
# Find something divisible with the shape_one
while (split_slice_size * slice_block_size) > 4:
split_slice_size = split_slice_size // 2
if split_slice_size <= 1:
split_slice_size = 1
break
else:
do_split = False
split_2_slice_size = query_tokens
if split_slice_size * slice_block_size > 4:
slice_block_size2 = (
shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply
)
do_split_2 = True
# Find something divisible with the batch_size_attention
while (split_2_slice_size * slice_block_size2) > 4:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
break
else:
do_split_2 = False
if do_split:
hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype)
for i in range(batch_size_attention // split_slice_size):
start_idx = i * split_slice_size
end_idx = (i + 1) * split_slice_size
if do_split_2:
for i2 in range(
query_tokens // split_2_slice_size
): # pylint: disable=invalid-name
start_idx_2 = i2 * split_2_slice_size
end_idx_2 = (i2 + 1) * split_2_slice_size
if no_shape_one:
hidden_states[
start_idx:end_idx, start_idx_2:end_idx_2
] = original_scaled_dot_product_attention(
query[start_idx:end_idx, start_idx_2:end_idx_2],
key[start_idx:end_idx, start_idx_2:end_idx_2],
value[start_idx:end_idx, start_idx_2:end_idx_2],
attn_mask=attn_mask[
start_idx:end_idx, start_idx_2:end_idx_2
]
if attn_mask is not None
else attn_mask,
dropout_p=dropout_p,
is_causal=is_causal,
)
else:
hidden_states[
:, start_idx:end_idx, start_idx_2:end_idx_2
] = original_scaled_dot_product_attention(
query[:, start_idx:end_idx, start_idx_2:end_idx_2],
key[:, start_idx:end_idx, start_idx_2:end_idx_2],
value[:, start_idx:end_idx, start_idx_2:end_idx_2],
attn_mask=attn_mask[
:, start_idx:end_idx, start_idx_2:end_idx_2
]
if attn_mask is not None
else attn_mask,
dropout_p=dropout_p,
is_causal=is_causal,
)
else:
if no_shape_one:
hidden_states[
start_idx:end_idx
] = original_scaled_dot_product_attention(
query[start_idx:end_idx],
key[start_idx:end_idx],
value[start_idx:end_idx],
attn_mask=attn_mask[start_idx:end_idx]
if attn_mask is not None
else attn_mask,
dropout_p=dropout_p,
is_causal=is_causal,
)
else:
hidden_states[
:, start_idx:end_idx
] = original_scaled_dot_product_attention(
query[:, start_idx:end_idx],
key[:, start_idx:end_idx],
value[:, start_idx:end_idx],
attn_mask=attn_mask[:, start_idx:end_idx]
if attn_mask is not None
else attn_mask,
dropout_p=dropout_p,
is_causal=is_causal,
)
else:
return original_scaled_dot_product_attention(
query,
key,
value,
attn_mask=attn_mask,
dropout_p=dropout_p,
is_causal=is_causal,
)
return hidden_states
def attention_init():
# ARC GPUs can't allocate more than 4GB to a single block:
torch.bmm = torch_bmm
torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention

@ -0,0 +1,187 @@
from collections import defaultdict
import torch
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import
# pylint: disable=protected-access, missing-function-docstring, line-too-long
OptState = ipex.cpu.autocast._grad_scaler.OptState
_MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator
_refresh_per_optimizer_state = (
ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state
)
def _unscale_grads_(
self, optimizer, inv_scale, found_inf, allow_fp16
): # pylint: disable=unused-argument
per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
per_device_found_inf = _MultiDeviceReplicator(found_inf)
# To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
# There could be hundreds of grads, so we'd like to iterate through them just once.
# However, we don't know their devices or dtypes in advance.
# https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
# Google says mypy struggles with defaultdicts type annotations.
per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated]
# sync grad to master weight
if hasattr(optimizer, "sync_grad"):
optimizer.sync_grad()
with torch.no_grad():
for group in optimizer.param_groups:
for param in group["params"]:
if param.grad is None:
continue
if (not allow_fp16) and param.grad.dtype == torch.float16:
raise ValueError("Attempting to unscale FP16 gradients.")
if param.grad.is_sparse:
# is_coalesced() == False means the sparse grad has values with duplicate indices.
# coalesce() deduplicates indices and adds all values that have the same index.
# For scaled fp16 values, there's a good chance coalescing will cause overflow,
# so we should check the coalesced _values().
if param.grad.dtype is torch.float16:
param.grad = param.grad.coalesce()
to_unscale = param.grad._values()
else:
to_unscale = param.grad
                # FIXME: is there a way to split by device and dtype without appending in the inner loop?
to_unscale = to_unscale.to("cpu")
per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(
to_unscale
)
for _, per_dtype_grads in per_device_and_dtype_grads.items():
for grads in per_dtype_grads.values():
core._amp_foreach_non_finite_check_and_unscale_(
grads,
per_device_found_inf.get("cpu"),
per_device_inv_scale.get("cpu"),
)
return per_device_found_inf._per_device_tensors
def unscale_(self, optimizer):
"""
Divides ("unscales") the optimizer's gradient tensors by the scale factor.
:meth:`unscale_` is optional, serving cases where you need to
:ref:`modify or inspect gradients<working-with-unscaled-gradients>`
between the backward pass(es) and :meth:`step`.
If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.
Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
...
scaler.scale(loss).backward()
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
scaler.step(optimizer)
scaler.update()
Args:
optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled.
.. warning::
:meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
and only after all gradients for that optimizer's assigned parameters have been accumulated.
Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
.. warning::
:meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
"""
if not self._enabled:
return
self._check_scale_growth_tracker("unscale_")
optimizer_state = self._per_optimizer_states[id(optimizer)]
if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise
raise RuntimeError(
"unscale_() has already been called on this optimizer since the last update()."
)
elif optimizer_state["stage"] is OptState.STEPPED:
raise RuntimeError("unscale_() is being called after step().")
# FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
assert self._scale is not None
inv_scale = (
self._scale.to("cpu").double().reciprocal().float().to(self._scale.device)
)
found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device)
optimizer_state["found_inf_per_device"] = self._unscale_grads_(
optimizer, inv_scale, found_inf, False
)
optimizer_state["stage"] = OptState.UNSCALED
def update(self, new_scale=None):
"""
Updates the scale factor.
If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
the scale is multiplied by ``growth_factor`` to increase it.
Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
used directly, it's used to fill GradScaler's internal scale tensor. So if
``new_scale`` was a tensor, later in-place changes to that tensor will not further
affect the scale GradScaler uses internally.)
Args:
new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor.
.. warning::
:meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
been invoked for all optimizers used this iteration.
"""
if not self._enabled:
return
_scale, _growth_tracker = self._check_scale_growth_tracker("update")
if new_scale is not None:
# Accept a new user-defined scale.
if isinstance(new_scale, float):
self._scale.fill_(new_scale) # type: ignore[union-attr]
else:
reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False."
assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined]
assert new_scale.numel() == 1, reason
assert new_scale.requires_grad is False, reason
self._scale.copy_(new_scale) # type: ignore[union-attr]
else:
# Consume shared inf/nan data collected from optimizers to update the scale.
# If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
found_infs = [
found_inf.to(device="cpu", non_blocking=True)
for state in self._per_optimizer_states.values()
for found_inf in state["found_inf_per_device"].values()
]
assert len(found_infs) > 0, "No inf checks were recorded prior to update."
found_inf_combined = found_infs[0]
if len(found_infs) > 1:
for i in range(1, len(found_infs)):
found_inf_combined += found_infs[i]
to_device = _scale.device
_scale = _scale.to("cpu")
_growth_tracker = _growth_tracker.to("cpu")
core._amp_update_scale_(
_scale,
_growth_tracker,
found_inf_combined,
self._growth_factor,
self._backoff_factor,
self._growth_interval,
)
_scale = _scale.to(to_device)
_growth_tracker = _growth_tracker.to(to_device)
# To prepare for next iteration, clear the data collected from optimizers this iteration.
self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
def gradscaler_init():
torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler
torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_
torch.xpu.amp.GradScaler.unscale_ = unscale_
torch.xpu.amp.GradScaler.update = update
return torch.xpu.amp.GradScaler

@ -0,0 +1,357 @@
import contextlib
import importlib
import torch
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
# pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return
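# CondFunc patches a function (looked up by dotted path) so that calls are routed to a substitute whenever a condition holds.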
class CondFunc: # pylint: disable=missing-class-docstring
def __new__(cls, orig_func, sub_func, cond_func):
self = super(CondFunc, cls).__new__(cls)
if isinstance(orig_func, str):
func_path = orig_func.split(".")
for i in range(len(func_path) - 1, -1, -1):
try:
resolved_obj = importlib.import_module(".".join(func_path[:i]))
break
except ImportError:
pass
for attr_name in func_path[i:-1]:
resolved_obj = getattr(resolved_obj, attr_name)
orig_func = getattr(resolved_obj, func_path[-1])
setattr(
resolved_obj,
func_path[-1],
lambda *args, **kwargs: self(*args, **kwargs),
)
self.__init__(orig_func, sub_func, cond_func)
return lambda *args, **kwargs: self(*args, **kwargs)
def __init__(self, orig_func, sub_func, cond_func):
self.__orig_func = orig_func
self.__sub_func = sub_func
self.__cond_func = cond_func
def __call__(self, *args, **kwargs):
if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs):
return self.__sub_func(self.__orig_func, *args, **kwargs)
else:
return self.__orig_func(*args, **kwargs)
_utils = torch.utils.data._utils
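# Replacement DataLoader worker shutdown that exits early while the interpreter is being torn down.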
def _shutdown_workers(self):
if (
torch.utils.data._utils is None
or torch.utils.data._utils.python_exit_status is True
or torch.utils.data._utils.python_exit_status is None
):
return
if hasattr(self, "_shutdown") and not self._shutdown:
self._shutdown = True
try:
if hasattr(self, "_pin_memory_thread"):
self._pin_memory_thread_done_event.set()
self._worker_result_queue.put((None, None))
self._pin_memory_thread.join()
self._worker_result_queue.cancel_join_thread()
self._worker_result_queue.close()
self._workers_done_event.set()
for worker_id in range(len(self._workers)):
if self._persistent_workers or self._workers_status[worker_id]:
self._mark_worker_as_unavailable(worker_id, shutdown=True)
for w in self._workers: # pylint: disable=invalid-name
w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL)
for q in self._index_queues: # pylint: disable=invalid-name
q.cancel_join_thread()
q.close()
finally:
if self._worker_pids_set:
torch.utils.data._utils.signal_handling._remove_worker_pids(id(self))
self._worker_pids_set = False
for w in self._workers: # pylint: disable=invalid-name
if w.is_alive():
w.terminate()
class DummyDataParallel(
torch.nn.Module
): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods
def __new__(
cls, module, device_ids=None, output_device=None, dim=0
): # pylint: disable=unused-argument
if isinstance(device_ids, list) and len(device_ids) > 1:
print("IPEX backend doesn't support DataParallel on multiple XPU devices")
return module.to("xpu")
def return_null_context(*args, **kwargs): # pylint: disable=unused-argument
return contextlib.nullcontext()
def check_device(device):
return bool(
(isinstance(device, torch.device) and device.type == "cuda")
or (isinstance(device, str) and "cuda" in device)
or isinstance(device, int)
)
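# Map a CUDA device spec (str, int, or torch.device) to the equivalent XPU spec.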
def return_xpu(device):
return (
f"xpu:{device[-1]}"
if isinstance(device, str) and ":" in device
else f"xpu:{device}"
if isinstance(device, int)
else torch.device("xpu")
if isinstance(device, torch.device)
else "xpu"
)
def ipex_no_cuda(orig_func, *args, **kwargs):
torch.cuda.is_available = lambda: False
orig_func(*args, **kwargs)
torch.cuda.is_available = torch.xpu.is_available
original_autocast = torch.autocast
def ipex_autocast(*args, **kwargs):
if len(args) > 0 and args[0] == "cuda":
return original_autocast("xpu", *args[1:], **kwargs)
else:
return original_autocast(*args, **kwargs)
original_torch_cat = torch.cat
def torch_cat(tensor, *args, **kwargs):
if len(tensor) == 3 and (
tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype
):
return original_torch_cat(
[tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)],
*args,
**kwargs,
)
else:
return original_torch_cat(tensor, *args, **kwargs)
original_interpolate = torch.nn.functional.interpolate
def interpolate(
tensor,
size=None,
scale_factor=None,
mode="nearest",
align_corners=None,
recompute_scale_factor=None,
antialias=False,
): # pylint: disable=too-many-arguments
if antialias or align_corners is not None:
return_device = tensor.device
return_dtype = tensor.dtype
return original_interpolate(
tensor.to("cpu", dtype=torch.float32),
size=size,
scale_factor=scale_factor,
mode=mode,
align_corners=align_corners,
recompute_scale_factor=recompute_scale_factor,
antialias=antialias,
).to(return_device, dtype=return_dtype)
else:
return original_interpolate(
tensor,
size=size,
scale_factor=scale_factor,
mode=mode,
align_corners=align_corners,
recompute_scale_factor=recompute_scale_factor,
antialias=antialias,
)
original_linalg_solve = torch.linalg.solve
def linalg_solve(A, B, *args, **kwargs): # pylint: disable=invalid-name
if A.device != torch.device("cpu") or B.device != torch.device("cpu"):
return_device = A.device
return original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs).to(
return_device
)
else:
return original_linalg_solve(A, B, *args, **kwargs)
def ipex_hijacks():
CondFunc(
"torch.Tensor.to",
lambda orig_func, self, device=None, *args, **kwargs: orig_func(
self, return_xpu(device), *args, **kwargs
),
lambda orig_func, self, device=None, *args, **kwargs: check_device(device),
)
CondFunc(
"torch.Tensor.cuda",
lambda orig_func, self, device=None, *args, **kwargs: orig_func(
self, return_xpu(device), *args, **kwargs
),
lambda orig_func, self, device=None, *args, **kwargs: check_device(device),
)
CondFunc(
"torch.empty",
lambda orig_func, *args, device=None, **kwargs: orig_func(
*args, device=return_xpu(device), **kwargs
),
lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
"torch.load",
lambda orig_func, *args, map_location=None, **kwargs: orig_func(
*args, return_xpu(map_location), **kwargs
),
lambda orig_func, *args, map_location=None, **kwargs: map_location is None
or check_device(map_location),
)
CondFunc(
"torch.randn",
lambda orig_func, *args, device=None, **kwargs: orig_func(
*args, device=return_xpu(device), **kwargs
),
lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
"torch.ones",
lambda orig_func, *args, device=None, **kwargs: orig_func(
*args, device=return_xpu(device), **kwargs
),
lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
"torch.zeros",
lambda orig_func, *args, device=None, **kwargs: orig_func(
*args, device=return_xpu(device), **kwargs
),
lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
"torch.tensor",
lambda orig_func, *args, device=None, **kwargs: orig_func(
*args, device=return_xpu(device), **kwargs
),
lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
"torch.linspace",
lambda orig_func, *args, device=None, **kwargs: orig_func(
*args, device=return_xpu(device), **kwargs
),
lambda orig_func, *args, device=None, **kwargs: check_device(device),
)
CondFunc(
"torch.Generator",
lambda orig_func, device=None: torch.xpu.Generator(device),
lambda orig_func, device=None: device is not None
and device != torch.device("cpu")
and device != "cpu",
)
CondFunc(
"torch.batch_norm",
lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(
input,
weight
if weight is not None
else torch.ones(input.size()[1], device=input.device),
bias
if bias is not None
else torch.zeros(input.size()[1], device=input.device),
*args,
**kwargs,
),
lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"),
)
CondFunc(
"torch.instance_norm",
lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(
input,
weight
if weight is not None
else torch.ones(input.size()[1], device=input.device),
bias
if bias is not None
else torch.zeros(input.size()[1], device=input.device),
*args,
**kwargs,
),
lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"),
)
# Functions with dtype errors:
CondFunc(
"torch.nn.modules.GroupNorm.forward",
lambda orig_func, self, input: orig_func(
self, input.to(self.weight.data.dtype)
),
lambda orig_func, self, input: input.dtype != self.weight.data.dtype,
)
CondFunc(
"torch.nn.modules.linear.Linear.forward",
lambda orig_func, self, input: orig_func(
self, input.to(self.weight.data.dtype)
),
lambda orig_func, self, input: input.dtype != self.weight.data.dtype,
)
CondFunc(
"torch.nn.modules.conv.Conv2d.forward",
lambda orig_func, self, input: orig_func(
self, input.to(self.weight.data.dtype)
),
lambda orig_func, self, input: input.dtype != self.weight.data.dtype,
)
CondFunc(
"torch.nn.functional.layer_norm",
lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: orig_func(
input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs
),
lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: weight
is not None
and input.dtype != weight.data.dtype,
)
    # Diffusers Float64 (ARC GPUs don't support double or Float64):
if not torch.xpu.has_fp64_dtype():
CondFunc(
"torch.from_numpy",
lambda orig_func, ndarray: orig_func(ndarray.astype("float32")),
lambda orig_func, ndarray: ndarray.dtype == float,
)
# Broken functions when torch.cuda.is_available is True:
CondFunc(
"torch.utils.data.dataloader._BaseDataLoaderIter.__init__",
lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs),
lambda orig_func, *args, **kwargs: True,
)
# Functions that make compile mad with CondFunc:
torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = (
_shutdown_workers
)
torch.nn.DataParallel = DummyDataParallel
torch.autocast = ipex_autocast
torch.cat = torch_cat
torch.linalg.solve = linalg_solve
torch.nn.functional.interpolate = interpolate
torch.backends.cuda.sdp_kernel = return_null_context

@ -0,0 +1,52 @@
import torch
from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
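# Export an RVC synthesizer checkpoint to ONNX with dynamic time axes.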
def export_onnx(ModelPath, ExportedPath):
cpt = torch.load(ModelPath, map_location="cpu")
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
test_phone = torch.rand(1, 200, vec_channels) # hidden unit
    test_phone_lengths = torch.tensor([200]).long()  # hidden unit lengths (seemingly unused)
    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # fundamental frequency (Hz)
    test_pitchf = torch.rand(1, 200)  # NSF fundamental frequency
    test_ds = torch.LongTensor([0])  # speaker ID
    test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)
    device = "cpu"  # device used during export (does not affect model use)
net_g = SynthesizerTrnMsNSFsidM(
*cpt["config"], is_half=False, version=cpt.get("version", "v1")
    )  # export in fp32; supporting fp16 in C++ would require manually rearranging memory, so fp16 is not used for now
net_g.load_state_dict(cpt["weight"], strict=False)
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
output_names = [
"audio",
]
    # net_g.construct_spkmixmap(n_speaker)  # multi-speaker mix-track export
torch.onnx.export(
net_g,
(
test_phone.to(device),
test_phone_lengths.to(device),
test_pitch.to(device),
test_pitchf.to(device),
test_ds.to(device),
test_rnd.to(device),
),
ExportedPath,
dynamic_axes={
"phone": [1],
"pitch": [1],
"pitchf": [1],
"rnd": [2],
},
do_constant_folding=False,
opset_version=13,
verbose=False,
input_names=input_names,
output_names=output_names,
)
return "Finished"

@ -0,0 +1,175 @@
import os
import sys
import traceback
import parselmouth
now_dir = os.getcwd()
sys.path.append(now_dir)
import logging
import numpy as np
import pyworld
from infer.lib.audio import load_audio
logging.getLogger("numba").setLevel(logging.WARNING)
from multiprocessing import Process
exp_dir = sys.argv[1]
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
def printt(strr):
print(strr)
f.write("%s\n" % strr)
f.flush()
n_p = int(sys.argv[2])
f0method = sys.argv[3]
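# FeatureInput computes f0 (pm / harvest / dio / rmvpe) plus its coarse quantization for each 16 kHz wav and saves them as .npy files.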
class FeatureInput(object):
def __init__(self, samplerate=16000, hop_size=160):
self.fs = samplerate
self.hop = hop_size
self.f0_bin = 256
self.f0_max = 1100.0
self.f0_min = 50.0
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
def compute_f0(self, path, f0_method):
x = load_audio(path, self.fs)
p_len = x.shape[0] // self.hop
if f0_method == "pm":
time_step = 160 / 16000 * 1000
f0_min = 50
f0_max = 1100
f0 = (
parselmouth.Sound(x, self.fs)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
f0, t = pyworld.harvest(
x.astype(np.double),
fs=self.fs,
f0_ceil=self.f0_max,
f0_floor=self.f0_min,
frame_period=1000 * self.hop / self.fs,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
elif f0_method == "dio":
f0, t = pyworld.dio(
x.astype(np.double),
fs=self.fs,
f0_ceil=self.f0_max,
f0_floor=self.f0_min,
frame_period=1000 * self.hop / self.fs,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
elif f0_method == "rmvpe":
if hasattr(self, "model_rmvpe") == False:
from infer.lib.rmvpe import RMVPE
print("Loading rmvpe model")
self.model_rmvpe = RMVPE(
"assets/rmvpe/rmvpe.pt", is_half=False, device="cpu"
)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
return f0
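    # Quantize f0 (Hz) into integer bins 1..255 on a mel-like scale.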
def coarse_f0(self, f0):
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
self.f0_bin - 2
) / (self.f0_mel_max - self.f0_mel_min) + 1
# use 0 or 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
f0_coarse = np.rint(f0_mel).astype(int)
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
f0_coarse.max(),
f0_coarse.min(),
)
return f0_coarse
def go(self, paths, f0_method):
if len(paths) == 0:
printt("no-f0-todo")
else:
printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # print at most 5 progress lines per process
for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
try:
if idx % n == 0:
printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
continue
featur_pit = self.compute_f0(inp_path, f0_method)
np.save(
opt_path2,
featur_pit,
allow_pickle=False,
) # nsf
coarse_pit = self.coarse_f0(featur_pit)
np.save(
opt_path1,
coarse_pit,
allow_pickle=False,
) # ori
except:
printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
if __name__ == "__main__":
# exp_dir=r"E:\codes\py39\dataset\mi-test"
# n_p=16
# f = open("%s/log_extract_f0.log"%exp_dir, "w")
printt(sys.argv)
featureInput = FeatureInput()
paths = []
inp_root = "%s/1_16k_wavs" % (exp_dir)
opt_root1 = "%s/2a_f0" % (exp_dir)
opt_root2 = "%s/2b-f0nsf" % (exp_dir)
os.makedirs(opt_root1, exist_ok=True)
os.makedirs(opt_root2, exist_ok=True)
for name in sorted(list(os.listdir(inp_root))):
inp_path = "%s/%s" % (inp_root, name)
if "spec" in inp_path:
continue
opt_path1 = "%s/%s" % (opt_root1, name)
opt_path2 = "%s/%s" % (opt_root2, name)
paths.append([inp_path, opt_path1, opt_path2])
ps = []
for i in range(n_p):
p = Process(
target=featureInput.go,
args=(
paths[i::n_p],
f0method,
),
)
ps.append(p)
p.start()
for i in range(n_p):
ps[i].join()

@ -0,0 +1,141 @@
import os
import sys
import traceback
import parselmouth
now_dir = os.getcwd()
sys.path.append(now_dir)
import logging
import numpy as np
import pyworld
from infer.lib.audio import load_audio
logging.getLogger("numba").setLevel(logging.WARNING)
n_part = int(sys.argv[1])
i_part = int(sys.argv[2])
i_gpu = sys.argv[3]
os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
exp_dir = sys.argv[4]
is_half = sys.argv[5]
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
def printt(strr):
print(strr)
f.write("%s\n" % strr)
f.flush()
class FeatureInput(object):
def __init__(self, samplerate=16000, hop_size=160):
self.fs = samplerate
self.hop = hop_size
self.f0_bin = 256
self.f0_max = 1100.0
self.f0_min = 50.0
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
def compute_f0(self, path, f0_method):
x = load_audio(path, self.fs)
# p_len = x.shape[0] // self.hop
if f0_method == "rmvpe":
if hasattr(self, "model_rmvpe") == False:
from infer.lib.rmvpe import RMVPE
print("Loading rmvpe model")
self.model_rmvpe = RMVPE(
"assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda"
)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
return f0
def coarse_f0(self, f0):
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
self.f0_bin - 2
) / (self.f0_mel_max - self.f0_mel_min) + 1
# use 0 or 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
f0_coarse = np.rint(f0_mel).astype(int)
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
f0_coarse.max(),
f0_coarse.min(),
)
return f0_coarse
def go(self, paths, f0_method):
if len(paths) == 0:
printt("no-f0-todo")
else:
printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # print at most 5 progress lines per process
for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
try:
if idx % n == 0:
printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
continue
featur_pit = self.compute_f0(inp_path, f0_method)
np.save(
opt_path2,
featur_pit,
allow_pickle=False,
) # nsf
coarse_pit = self.coarse_f0(featur_pit)
np.save(
opt_path1,
coarse_pit,
allow_pickle=False,
) # ori
except:
printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
if __name__ == "__main__":
# exp_dir=r"E:\codes\py39\dataset\mi-test"
# n_p=16
# f = open("%s/log_extract_f0.log"%exp_dir, "w")
printt(sys.argv)
featureInput = FeatureInput()
paths = []
inp_root = "%s/1_16k_wavs" % (exp_dir)
opt_root1 = "%s/2a_f0" % (exp_dir)
opt_root2 = "%s/2b-f0nsf" % (exp_dir)
os.makedirs(opt_root1, exist_ok=True)
os.makedirs(opt_root2, exist_ok=True)
for name in sorted(list(os.listdir(inp_root))):
inp_path = "%s/%s" % (inp_root, name)
if "spec" in inp_path:
continue
opt_path1 = "%s/%s" % (opt_root1, name)
opt_path2 = "%s/%s" % (opt_root2, name)
paths.append([inp_path, opt_path1, opt_path2])
try:
featureInput.go(paths[i_part::n_part], "rmvpe")
except:
printt("f0_all_fail-%s" % (traceback.format_exc()))
# ps = []
# for i in range(n_p):
# p = Process(
# target=featureInput.go,
# args=(
# paths[i::n_p],
# f0method,
# ),
# )
# ps.append(p)
# p.start()
# for i in range(n_p):
# ps[i].join()

@ -0,0 +1,139 @@
import os
import sys
import traceback
import parselmouth
now_dir = os.getcwd()
sys.path.append(now_dir)
import logging
import numpy as np
import pyworld
from infer.lib.audio import load_audio
logging.getLogger("numba").setLevel(logging.WARNING)
exp_dir = sys.argv[1]
import torch_directml
device = torch_directml.device(torch_directml.default_device())
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
def printt(strr):
print(strr)
f.write("%s\n" % strr)
f.flush()
class FeatureInput(object):
def __init__(self, samplerate=16000, hop_size=160):
self.fs = samplerate
self.hop = hop_size
self.f0_bin = 256
self.f0_max = 1100.0
self.f0_min = 50.0
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
def compute_f0(self, path, f0_method):
x = load_audio(path, self.fs)
# p_len = x.shape[0] // self.hop
if f0_method == "rmvpe":
if hasattr(self, "model_rmvpe") == False:
from infer.lib.rmvpe import RMVPE
print("Loading rmvpe model")
self.model_rmvpe = RMVPE(
"assets/rmvpe/rmvpe.pt", is_half=False, device=device
)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
return f0
def coarse_f0(self, f0):
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
self.f0_bin - 2
) / (self.f0_mel_max - self.f0_mel_min) + 1
# use 0 or 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
f0_coarse = np.rint(f0_mel).astype(int)
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
f0_coarse.max(),
f0_coarse.min(),
)
return f0_coarse
def go(self, paths, f0_method):
if len(paths) == 0:
printt("no-f0-todo")
else:
printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # print at most 5 progress lines per process
for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
try:
if idx % n == 0:
printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
continue
featur_pit = self.compute_f0(inp_path, f0_method)
np.save(
opt_path2,
featur_pit,
allow_pickle=False,
) # nsf
coarse_pit = self.coarse_f0(featur_pit)
np.save(
opt_path1,
coarse_pit,
allow_pickle=False,
) # ori
except:
printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
if __name__ == "__main__":
# exp_dir=r"E:\codes\py39\dataset\mi-test"
# n_p=16
# f = open("%s/log_extract_f0.log"%exp_dir, "w")
printt(sys.argv)
featureInput = FeatureInput()
paths = []
inp_root = "%s/1_16k_wavs" % (exp_dir)
opt_root1 = "%s/2a_f0" % (exp_dir)
opt_root2 = "%s/2b-f0nsf" % (exp_dir)
os.makedirs(opt_root1, exist_ok=True)
os.makedirs(opt_root2, exist_ok=True)
for name in sorted(list(os.listdir(inp_root))):
inp_path = "%s/%s" % (inp_root, name)
if "spec" in inp_path:
continue
opt_path1 = "%s/%s" % (opt_root1, name)
opt_path2 = "%s/%s" % (opt_root2, name)
paths.append([inp_path, opt_path1, opt_path2])
try:
featureInput.go(paths, "rmvpe")
except:
printt("f0_all_fail-%s" % (traceback.format_exc()))
# ps = []
# for i in range(n_p):
# p = Process(
# target=featureInput.go,
# args=(
# paths[i::n_p],
# f0method,
# ),
# )
# ps.append(p)
# p.start()
# for i in range(n_p):
# ps[i].join()

@ -0,0 +1,137 @@
import os
import sys
import traceback
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
device = sys.argv[1]
n_part = int(sys.argv[2])
i_part = int(sys.argv[3])
if len(sys.argv) == 6:
exp_dir = sys.argv[4]
version = sys.argv[5]
else:
i_gpu = sys.argv[4]
exp_dir = sys.argv[5]
os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
version = sys.argv[6]
import fairseq
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
if "privateuseone" not in device:
device = "cpu"
if torch.cuda.is_available():
device = "cuda"
elif torch.backends.mps.is_available():
device = "mps"
else:
import torch_directml
device = torch_directml.device(torch_directml.default_device())
def forward_dml(ctx, x, scale):
ctx.scale = scale
res = x.clone().detach()
return res
fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
def printt(strr):
print(strr)
f.write("%s\n" % strr)
f.flush()
printt(sys.argv)
model_path = "assets/hubert/hubert_base.pt"
printt(exp_dir)
wavPath = "%s/1_16k_wavs" % exp_dir
outPath = (
"%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir
)
os.makedirs(outPath, exist_ok=True)
# wave must be 16k, hop_size=320
def readwave(wav_path, normalize=False):
wav, sr = sf.read(wav_path)
assert sr == 16000
feats = torch.from_numpy(wav).float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
if normalize:
with torch.no_grad():
feats = F.layer_norm(feats, feats.shape)
feats = feats.view(1, -1)
return feats
# HuBERT model
printt("load model(s) from {}".format(model_path))
# make sure the HuBERT model file exists
if not os.access(model_path, os.F_OK):
printt(
"Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main"
% model_path
)
exit(0)
models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[model_path],
suffix="",
)
model = models[0]
model = model.to(device)
printt("move model to %s" % device)
if device not in ["mps", "cpu"]:
model = model.half()
model.eval()
todo = sorted(list(os.listdir(wavPath)))[i_part::n_part]
n = max(1, len(todo) // 10)  # print at most 10 progress lines
if len(todo) == 0:
printt("no-feature-todo")
else:
printt("all-feature-%s" % len(todo))
for idx, file in enumerate(todo):
try:
if file.endswith(".wav"):
wav_path = "%s/%s" % (wavPath, file)
out_path = "%s/%s" % (outPath, file.replace("wav", "npy"))
if os.path.exists(out_path):
continue
feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
inputs = {
"source": feats.half().to(device)
if device not in ["mps", "cpu"]
else feats.to(device),
"padding_mask": padding_mask.to(device),
"output_layer": 9 if version == "v1" else 12, # layer 9
}
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = (
model.final_proj(logits[0]) if version == "v1" else logits[0]
)
feats = feats.squeeze(0).float().cpu().numpy()
if np.isnan(feats).sum() == 0:
np.save(out_path, feats, allow_pickle=False)
else:
printt("%s-contains nan" % file)
if idx % n == 0:
printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape))
except:
printt(traceback.format_exc())
printt("all-feature-done")

@ -0,0 +1,147 @@
import multiprocessing
import os
import sys
from scipy import signal
now_dir = os.getcwd()
sys.path.append(now_dir)
print(sys.argv)
inp_root = sys.argv[1]
sr = int(sys.argv[2])
n_p = int(sys.argv[3])
exp_dir = sys.argv[4]
noparallel = sys.argv[5] == "True"
per = float(sys.argv[6])
import traceback
import librosa
import numpy as np
from scipy.io import wavfile
from infer.lib.audio import load_audio
from infer.lib.slicer2 import Slicer
mutex = multiprocessing.Lock()
f = open("%s/preprocess.log" % exp_dir, "a+")
def println(strr):
mutex.acquire()
print(strr)
f.write("%s\n" % strr)
f.flush()
mutex.release()
class PreProcess:
def __init__(self, sr, exp_dir, per=3.7):
self.slicer = Slicer(
sr=sr,
threshold=-42,
min_length=1500,
min_interval=400,
hop_size=15,
max_sil_kept=500,
)
self.sr = sr
self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
self.per = per
self.overlap = 0.3
self.tail = self.per + self.overlap
self.max = 0.9
self.alpha = 0.75
self.exp_dir = exp_dir
self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
os.makedirs(self.exp_dir, exist_ok=True)
os.makedirs(self.gt_wavs_dir, exist_ok=True)
os.makedirs(self.wavs16k_dir, exist_ok=True)
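    # Normalize a slice (alpha-blend of peak-normalized and raw audio), write it at the training sample rate and as a 16 kHz resampled copy.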
def norm_write(self, tmp_audio, idx0, idx1):
tmp_max = np.abs(tmp_audio).max()
if tmp_max > 2.5:
print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
return
tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
1 - self.alpha
) * tmp_audio
wavfile.write(
"%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
self.sr,
tmp_audio.astype(np.float32),
)
tmp_audio = librosa.resample(
tmp_audio, orig_sr=self.sr, target_sr=16000
) # , res_type="soxr_vhq"
wavfile.write(
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
16000,
tmp_audio.astype(np.float32),
)
def pipeline(self, path, idx0):
try:
audio = load_audio(path, self.sr)
            # a zero-phase digital filter causes pre-ringing noise...
# audio = signal.filtfilt(self.bh, self.ah, audio)
audio = signal.lfilter(self.bh, self.ah, audio)
idx1 = 0
for audio in self.slicer.slice(audio):
i = 0
                while True:
start = int(self.sr * (self.per - self.overlap) * i)
i += 1
if len(audio[start:]) > self.tail * self.sr:
tmp_audio = audio[start : start + int(self.per * self.sr)]
self.norm_write(tmp_audio, idx0, idx1)
idx1 += 1
else:
tmp_audio = audio[start:]
idx1 += 1
break
self.norm_write(tmp_audio, idx0, idx1)
println("%s->Suc." % path)
except:
println("%s->%s" % (path, traceback.format_exc()))
def pipeline_mp(self, infos):
for path, idx0 in infos:
self.pipeline(path, idx0)
def pipeline_mp_inp_dir(self, inp_root, n_p):
try:
infos = [
("%s/%s" % (inp_root, name), idx)
for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
]
if noparallel:
for i in range(n_p):
self.pipeline_mp(infos[i::n_p])
else:
ps = []
for i in range(n_p):
p = multiprocessing.Process(
target=self.pipeline_mp, args=(infos[i::n_p],)
)
ps.append(p)
p.start()
for i in range(n_p):
ps[i].join()
except:
println("Fail. %s" % traceback.format_exc())
def preprocess_trainset(inp_root, sr, n_p, exp_dir, per):
pp = PreProcess(sr, exp_dir, per)
println("start preprocess")
println(sys.argv)
pp.pipeline_mp_inp_dir(inp_root, n_p)
println("end preprocess")
if __name__ == "__main__":
preprocess_trainset(inp_root, sr, n_p, exp_dir, per)
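For reference, this preprocessing script is driven entirely by positional sys.argv values (inp_root, sr, n_p, exp_dir, noparallel, per). A typical invocation might look like the sketch below; the script filename, paths, and parameter values are illustrative assumptions, not part of the commit.
# Assumed filename for this file: preprocess.py
# python preprocess.py <inp_root> <sr> <n_p> <exp_dir> <noparallel> <per>
# e.g.: python preprocess.py ./raw_wavs 40000 4 ./logs/my-exp False 3.7
# This slices every audio file in ./raw_wavs, writing full-rate segments to
# ./logs/my-exp/0_gt_wavs and 16 kHz copies to ./logs/my-exp/1_16k_wavs.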

@ -0,0 +1,643 @@
import os
import sys
import logging
logger = logging.getLogger(__name__)
now_dir = os.getcwd()
sys.path.append(os.path.join(now_dir))
import datetime
from infer.lib.train import utils
hps = utils.get_hparams()
os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",")
n_gpus = len(hps.gpus.split("-"))
from random import randint, shuffle
import torch
try:
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
if torch.xpu.is_available():
from infer.modules.ipex import ipex_init
from infer.modules.ipex.gradscaler import gradscaler_init
from torch.xpu.amp import autocast
GradScaler = gradscaler_init()
ipex_init()
else:
from torch.cuda.amp import GradScaler, autocast
except Exception:
from torch.cuda.amp import GradScaler, autocast
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
from time import sleep
from time import time as ttime
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from infer.lib.infer_pack import commons
from infer.lib.train.data_utils import (
DistributedBucketSampler,
TextAudioCollate,
TextAudioCollateMultiNSFsid,
TextAudioLoader,
TextAudioLoaderMultiNSFsid,
)
if hps.version == "v1":
from infer.lib.infer_pack.models import MultiPeriodDiscriminator
from infer.lib.infer_pack.models import SynthesizerTrnMs256NSFsid as RVC_Model_f0
from infer.lib.infer_pack.models import (
SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0,
)
else:
from infer.lib.infer_pack.models import (
SynthesizerTrnMs768NSFsid as RVC_Model_f0,
SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0,
MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator,
)
from infer.lib.train.losses import (
discriminator_loss,
feature_loss,
generator_loss,
kl_loss,
)
from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
from infer.lib.train.process_ckpt import savee
global_step = 0
class EpochRecorder:
def __init__(self):
self.last_time = ttime()
def record(self):
now_time = ttime()
elapsed_time = now_time - self.last_time
self.last_time = now_time
elapsed_time_str = str(datetime.timedelta(seconds=elapsed_time))
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
return f"[{current_time}] | ({elapsed_time_str})"
def main():
n_gpus = torch.cuda.device_count()
if torch.cuda.is_available() == False and torch.backends.mps.is_available() == True:
n_gpus = 1
if n_gpus < 1:
# patch to unblock people without gpus. there is probably a better way.
print("NO GPU DETECTED: falling back to CPU - this may take a while")
n_gpus = 1
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
children = []
for i in range(n_gpus):
subproc = mp.Process(
target=run,
args=(i, n_gpus, hps),
)
children.append(subproc)
subproc.start()
for i in range(n_gpus):
children[i].join()
def run(
rank,
n_gpus,
hps,
):
global global_step
if rank == 0:
logger = utils.get_logger(hps.model_dir)
logger.info(hps)
# utils.check_git_hash(hps.model_dir)
writer = SummaryWriter(log_dir=hps.model_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
dist.init_process_group(
backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
)
torch.manual_seed(hps.train.seed)
if torch.cuda.is_available():
torch.cuda.set_device(rank)
if hps.if_f0 == 1:
train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
else:
train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
train_sampler = DistributedBucketSampler(
train_dataset,
hps.train.batch_size * n_gpus,
# [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s
[100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s
num_replicas=n_gpus,
rank=rank,
shuffle=True,
)
# It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
# num_workers=8 -> num_workers=4
if hps.if_f0 == 1:
collate_fn = TextAudioCollateMultiNSFsid()
else:
collate_fn = TextAudioCollate()
train_loader = DataLoader(
train_dataset,
num_workers=4,
shuffle=False,
pin_memory=True,
collate_fn=collate_fn,
batch_sampler=train_sampler,
persistent_workers=True,
prefetch_factor=8,
)
if hps.if_f0 == 1:
net_g = RVC_Model_f0(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
is_half=hps.train.fp16_run,
sr=hps.sample_rate,
)
else:
net_g = RVC_Model_nof0(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
is_half=hps.train.fp16_run,
)
if torch.cuda.is_available():
net_g = net_g.cuda(rank)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm)
if torch.cuda.is_available():
net_d = net_d.cuda(rank)
optim_g = torch.optim.AdamW(
net_g.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps,
)
optim_d = torch.optim.AdamW(
net_d.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps,
)
# net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
# net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
if hasattr(torch, "xpu") and torch.xpu.is_available():
pass
elif torch.cuda.is_available():
net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank])
else:
net_g = DDP(net_g)
net_d = DDP(net_d)
    try:  # auto-resume if a checkpoint can be loaded
_, _, _, epoch_str = utils.load_checkpoint(
utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d
        )  # loading D usually works fine
if rank == 0:
logger.info("loaded D")
# _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
_, _, _, epoch_str = utils.load_checkpoint(
utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g
)
global_step = (epoch_str - 1) * len(train_loader)
# epoch_str = 1
# global_step = 0
    except:  # nothing to load on the first run, so fall back to the pretrained weights
# traceback.print_exc()
epoch_str = 1
global_step = 0
if hps.pretrainG != "":
if rank == 0:
logger.info("loaded pretrained %s" % (hps.pretrainG))
if hasattr(net_g, "module"):
logger.info(
net_g.module.load_state_dict(
torch.load(hps.pretrainG, map_location="cpu")["model"]
)
                )  ## for testing: do not load the optimizer state
else:
logger.info(
net_g.load_state_dict(
torch.load(hps.pretrainG, map_location="cpu")["model"]
)
                )  ## for testing: do not load the optimizer state
if hps.pretrainD != "":
if rank == 0:
logger.info("loaded pretrained %s" % (hps.pretrainD))
if hasattr(net_d, "module"):
logger.info(
net_d.module.load_state_dict(
torch.load(hps.pretrainD, map_location="cpu")["model"]
)
)
else:
logger.info(
net_d.load_state_dict(
torch.load(hps.pretrainD, map_location="cpu")["model"]
)
)
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
)
scaler = GradScaler(enabled=hps.train.fp16_run)
cache = []
for epoch in range(epoch_str, hps.train.epochs + 1):
if rank == 0:
train_and_evaluate(
rank,
epoch,
hps,
[net_g, net_d],
[optim_g, optim_d],
[scheduler_g, scheduler_d],
scaler,
[train_loader, None],
logger,
[writer, writer_eval],
cache,
)
else:
train_and_evaluate(
rank,
epoch,
hps,
[net_g, net_d],
[optim_g, optim_d],
[scheduler_g, scheduler_d],
scaler,
[train_loader, None],
None,
None,
cache,
)
scheduler_g.step()
scheduler_d.step()
def train_and_evaluate(
rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache
):
net_g, net_d = nets
optim_g, optim_d = optims
train_loader, eval_loader = loaders
if writers is not None:
writer, writer_eval = writers
train_loader.batch_sampler.set_epoch(epoch)
global global_step
net_g.train()
net_d.train()
# Prepare data iterator
if hps.if_cache_data_in_gpu == True:
# Use Cache
data_iterator = cache
if cache == []:
# Make new cache
for batch_idx, info in enumerate(train_loader):
# Unpack
if hps.if_f0 == 1:
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
else:
(
phone,
phone_lengths,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
# Load on CUDA
if torch.cuda.is_available():
phone = phone.cuda(rank, non_blocking=True)
phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
if hps.if_f0 == 1:
pitch = pitch.cuda(rank, non_blocking=True)
pitchf = pitchf.cuda(rank, non_blocking=True)
sid = sid.cuda(rank, non_blocking=True)
spec = spec.cuda(rank, non_blocking=True)
spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
wave = wave.cuda(rank, non_blocking=True)
wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
# Cache on list
if hps.if_f0 == 1:
cache.append(
(
batch_idx,
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
),
)
)
else:
cache.append(
(
batch_idx,
(
phone,
phone_lengths,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
),
)
)
else:
# Load shuffled cache
shuffle(cache)
else:
# Loader
data_iterator = enumerate(train_loader)
# Run steps
epoch_recorder = EpochRecorder()
for batch_idx, info in data_iterator:
# Data
## Unpack
if hps.if_f0 == 1:
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
else:
phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
## Load on CUDA
if (hps.if_cache_data_in_gpu == False) and torch.cuda.is_available():
phone = phone.cuda(rank, non_blocking=True)
phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
if hps.if_f0 == 1:
pitch = pitch.cuda(rank, non_blocking=True)
pitchf = pitchf.cuda(rank, non_blocking=True)
sid = sid.cuda(rank, non_blocking=True)
spec = spec.cuda(rank, non_blocking=True)
spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
wave = wave.cuda(rank, non_blocking=True)
# wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
# Calculate
with autocast(enabled=hps.train.fp16_run):
if hps.if_f0 == 1:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
else:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
mel = spec_to_mel_torch(
spec,
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
y_mel = commons.slice_segments(
mel, ids_slice, hps.train.segment_size // hps.data.hop_length
)
with autocast(enabled=False):
y_hat_mel = mel_spectrogram_torch(
y_hat.float().squeeze(1),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
if hps.train.fp16_run == True:
y_hat_mel = y_hat_mel.half()
wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
y_d_hat_r, y_d_hat_g
)
optim_d.zero_grad()
scaler.scale(loss_disc).backward()
scaler.unscale_(optim_d)
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
with autocast(enabled=False):
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]["lr"]
logger.info(
"Train Epoch: {} [{:.0f}%]".format(
epoch, 100.0 * batch_idx / len(train_loader)
)
)
# Amor For Tensorboard display
if loss_mel > 75:
loss_mel = 75
if loss_kl > 9:
loss_kl = 9
logger.info([global_step, lr])
logger.info(
f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
)
scalar_dict = {
"loss/g/total": loss_gen_all,
"loss/d/total": loss_disc,
"learning_rate": lr,
"grad_norm_d": grad_norm_d,
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{
"loss/g/fm": loss_fm,
"loss/g/mel": loss_mel,
"loss/g/kl": loss_kl,
}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
)
scalar_dict.update(
{"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
global_step += 1
# /Run steps
if epoch % hps.save_every_epoch == 0 and rank == 0:
if hps.if_latest == 0:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
)
else:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
)
if rank == 0 and hps.save_every_weights == "1":
if hasattr(net_g, "module"):
ckpt = net_g.module.state_dict()
else:
ckpt = net_g.state_dict()
logger.info(
"saving ckpt %s_e%s:%s"
% (
hps.name,
epoch,
savee(
ckpt,
hps.sample_rate,
hps.if_f0,
hps.name + "_e%s_s%s" % (epoch, global_step),
epoch,
hps.version,
hps,
),
)
)
if rank == 0:
logger.info("====> Epoch: {} {}".format(epoch, epoch_recorder.record()))
if epoch >= hps.total_epoch and rank == 0:
logger.info("Training is done. The program is closed.")
if hasattr(net_g, "module"):
ckpt = net_g.module.state_dict()
else:
ckpt = net_g.state_dict()
logger.info(
"saving final ckpt:%s"
% (
savee(
ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps
)
)
)
sleep(1)
os._exit(2333333)
if __name__ == "__main__":
torch.multiprocessing.set_start_method("spawn")
main()

@ -0,0 +1,195 @@
import os
import logging
logger = logging.getLogger(__name__)
import librosa
import numpy as np
import soundfile as sf
import torch
from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets
from infer.lib.uvr5_pack.lib_v5 import spec_utils
from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters
from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet
from infer.lib.uvr5_pack.utils import inference
class AudioPre:
def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
"tta": tta,
# Constants
"window_size": 512,
"agg": agg,
"high_end_process": "mirroring",
}
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(
self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
):
if ins_root is None and vocal_root is None:
return "No save root."
name = os.path.basename(music_file)
if ins_root is not None:
os.makedirs(ins_root, exist_ok=True)
if vocal_root is not None:
os.makedirs(vocal_root, exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param["band"])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
(
X_wave[d],
_,
                ) = librosa.core.load(  # in theory librosa may misread some audio; ffmpeg would be more robust but is too cumbersome, so librosa is kept
music_file,
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.core.resample(
X_wave[d + 1],
self.mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
X_wave[d],
bp["hl"],
bp["n_fft"],
self.mp.param["mid_side"],
self.mp.param["mid_side_b2"],
self.mp.param["reverse"],
)
# pdb.set_trace()
if d == bands_n and self.data["high_end_process"] != "none":
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
)
input_high_end = X_spec_s[d][
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
aggresive_set = float(self.data["agg"] / 100)
aggressiveness = {
"value": aggresive_set,
"split_bin": self.mp.param["band"][1]["crop_stop"],
}
with torch.no_grad():
pred, X_mag, X_phase = inference(
X_spec_m, self.device, self.model, aggressiveness, self.data
)
# Postprocess
if self.data["postprocess"]:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
if ins_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
logger.info("%s instruments done" % name)
if is_hp3 == True:
head = "vocal_"
else:
head = "instrument_"
if format in ["wav", "flac"]:
sf.write(
os.path.join(
ins_root,
head + "{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
) #
else:
path = os.path.join(
ins_root, head + "{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
if vocal_root is not None:
if is_hp3 == True:
head = "instrument_"
else:
head = "vocal_"
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
logger.info("%s vocals done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
vocal_root,
head + "{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
else:
path = os.path.join(
vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
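A minimal way to drive the AudioPre class above, assuming an HP5-style weight file is available locally; the model path, aggressiveness value, and input/output paths below are illustrative examples, not values taken from the commit.
import torch

from infer.modules.uvr5.vr import AudioPre

separator = AudioPre(
    agg=10,  # vocal-extraction aggressiveness, 0-100
    model_path="assets/uvr5_weights/HP5.pth",  # assumed weight location
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    is_half=False,
)
# Writes instrument_*/vocal_* files (named after the input and the agg value)
# into the two output directories.
separator._path_audio_("song.wav", ins_root="opt", vocal_root="opt", format="flac")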

@ -0,0 +1,256 @@
import os
import logging
logger = logging.getLogger(__name__)
import librosa
import numpy as np
import soundfile as sf
import torch
from tqdm import tqdm
cpu = torch.device("cpu")
class ConvTDFNetTrim:
def __init__(
self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024
):
super(ConvTDFNetTrim, self).__init__()
self.dim_f = dim_f
self.dim_t = 2**dim_t
self.n_fft = n_fft
self.hop = hop
self.n_bins = self.n_fft // 2 + 1
self.chunk_size = hop * (self.dim_t - 1)
self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(
device
)
self.target_name = target_name
self.blender = "blender" in model_name
self.dim_c = 4
out_c = self.dim_c * 4 if target_name == "*" else self.dim_c
self.freq_pad = torch.zeros(
[1, out_c, self.n_bins - self.dim_f, self.dim_t]
).to(device)
self.n = L // 2
def stft(self, x):
x = x.reshape([-1, self.chunk_size])
x = torch.stft(
x,
n_fft=self.n_fft,
hop_length=self.hop,
window=self.window,
center=True,
return_complex=True,
)
x = torch.view_as_real(x)
x = x.permute([0, 3, 1, 2])
x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
[-1, self.dim_c, self.n_bins, self.dim_t]
)
return x[:, :, : self.dim_f]
def istft(self, x, freq_pad=None):
freq_pad = (
self.freq_pad.repeat([x.shape[0], 1, 1, 1])
if freq_pad is None
else freq_pad
)
x = torch.cat([x, freq_pad], -2)
c = 4 * 2 if self.target_name == "*" else 2
x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape(
[-1, 2, self.n_bins, self.dim_t]
)
x = x.permute([0, 2, 3, 1])
x = x.contiguous()
x = torch.view_as_complex(x)
x = torch.istft(
x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True
)
return x.reshape([-1, c, self.chunk_size])
def get_models(device, dim_f, dim_t, n_fft):
return ConvTDFNetTrim(
device=device,
model_name="Conv-TDF",
target_name="vocals",
L=11,
dim_f=dim_f,
dim_t=dim_t,
n_fft=n_fft,
)
class Predictor:
def __init__(self, args):
import onnxruntime as ort
logger.info(ort.get_available_providers())
self.args = args
self.model_ = get_models(
device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft
)
self.model = ort.InferenceSession(
os.path.join(args.onnx, self.model_.target_name + ".onnx"),
providers=[
"CUDAExecutionProvider",
"DmlExecutionProvider",
"CPUExecutionProvider",
],
)
logger.info("ONNX load done")
def demix(self, mix):
samples = mix.shape[-1]
margin = self.args.margin
chunk_size = self.args.chunks * 44100
assert not margin == 0, "margin cannot be zero!"
if margin > chunk_size:
margin = chunk_size
segmented_mix = {}
if self.args.chunks == 0 or samples < chunk_size:
chunk_size = samples
counter = -1
for skip in range(0, samples, chunk_size):
counter += 1
s_margin = 0 if counter == 0 else margin
end = min(skip + chunk_size + margin, samples)
start = skip - s_margin
segmented_mix[skip] = mix[:, start:end].copy()
if end == samples:
break
sources = self.demix_base(segmented_mix, margin_size=margin)
"""
mix:(2,big_sample)
segmented_mix:offset->(2,small_sample)
sources:(1,2,big_sample)
"""
return sources
def demix_base(self, mixes, margin_size):
chunked_sources = []
progress_bar = tqdm(total=len(mixes))
progress_bar.set_description("Processing")
for mix in mixes:
cmix = mixes[mix]
sources = []
n_sample = cmix.shape[1]
model = self.model_
trim = model.n_fft // 2
gen_size = model.chunk_size - 2 * trim
pad = gen_size - n_sample % gen_size
mix_p = np.concatenate(
(np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
)
mix_waves = []
i = 0
while i < n_sample + pad:
waves = np.array(mix_p[:, i : i + model.chunk_size])
mix_waves.append(waves)
i += gen_size
mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu)
with torch.no_grad():
_ort = self.model
spek = model.stft(mix_waves)
if self.args.denoise:
spec_pred = (
-_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5
+ _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5
)
tar_waves = model.istft(torch.tensor(spec_pred))
else:
tar_waves = model.istft(
torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])
)
tar_signal = (
tar_waves[:, :, trim:-trim]
.transpose(0, 1)
.reshape(2, -1)
.numpy()[:, :-pad]
)
start = 0 if mix == 0 else margin_size
end = None if mix == list(mixes.keys())[::-1][0] else -margin_size
if margin_size == 0:
end = None
sources.append(tar_signal[:, start:end])
progress_bar.update(1)
chunked_sources.append(sources)
_sources = np.concatenate(chunked_sources, axis=-1)
# del self.model
progress_bar.close()
return _sources
def prediction(self, m, vocal_root, others_root, format):
os.makedirs(vocal_root, exist_ok=True)
os.makedirs(others_root, exist_ok=True)
basename = os.path.basename(m)
mix, rate = librosa.load(m, mono=False, sr=44100)
if mix.ndim == 1:
mix = np.asfortranarray([mix, mix])
mix = mix.T
sources = self.demix(mix.T)
opt = sources[0].T
if format in ["wav", "flac"]:
sf.write(
"%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate
)
sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)
else:
path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename)
path_other = "%s/%s_others.wav" % (others_root, basename)
sf.write(path_vocal, mix - opt, rate)
sf.write(path_other, opt, rate)
opt_path_vocal = path_vocal[:-4] + ".%s" % format
opt_path_other = path_other[:-4] + ".%s" % format
if os.path.exists(path_vocal):
os.system(
"ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal)
)
if os.path.exists(opt_path_vocal):
try:
os.remove(path_vocal)
except:
pass
if os.path.exists(path_other):
os.system(
"ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other)
)
if os.path.exists(opt_path_other):
try:
os.remove(path_other)
except:
pass
class MDXNetDereverb:
def __init__(self, chunks, device):
self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy"
self.shifts = 10 # 'Predict with randomised equivariant stabilisation'
self.mixing = "min_mag" # ['default','min_mag','max_mag']
self.chunks = chunks
self.margin = 44100
self.dim_t = 9
self.dim_f = 3072
self.n_fft = 6144
self.denoise = True
self.pred = Predictor(self)
self.device = device
def _path_audio_(self, input, vocal_root, others_root, format, is_hp3=False):
self.pred.prediction(input, vocal_root, others_root, format)
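A sketch of how the dereverb wrapper above might be invoked directly, assuming the FoxJoy ONNX weights already exist under assets/uvr5_weights/onnx_dereverb_By_FoxJoy; chunks=15 mirrors the value used by the uvr() helper later in this commit, while the input file and output directories are assumed examples.
import torch

from infer.modules.uvr5.mdxnet import MDXNetDereverb

dereverb = MDXNetDereverb(chunks=15, device="cuda:0" if torch.cuda.is_available() else "cpu")
# Produces <name>_main_vocal and <name>_others files in the two directories.
dereverb._path_audio_("song.wav", "output/vocal", "output/others", "wav")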

@ -0,0 +1,108 @@
import os
import traceback
import logging
logger = logging.getLogger(__name__)
import ffmpeg
import torch
from configs.config import Config
from infer.modules.uvr5.mdxnet import MDXNetDereverb
from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho
config = Config()
def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
infos = []
try:
inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
save_root_vocal = (
save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
)
save_root_ins = (
save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
)
if model_name == "onnx_dereverb_By_FoxJoy":
pre_fun = MDXNetDereverb(15, config.device)
else:
func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
pre_fun = func(
agg=int(agg),
model_path=os.path.join(
os.getenv("weight_uvr5_root"), model_name + ".pth"
),
device=config.device,
is_half=config.is_half,
)
is_hp3 = "HP3" in model_name
if inp_root != "":
paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
else:
paths = [path.name for path in paths]
for path in paths:
inp_path = os.path.join(inp_root, path)
need_reformat = 1
done = 0
try:
info = ffmpeg.probe(inp_path, cmd="ffprobe")
if (
info["streams"][0]["channels"] == 2
and info["streams"][0]["sample_rate"] == "44100"
):
need_reformat = 0
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3
)
done = 1
except:
need_reformat = 1
traceback.print_exc()
if need_reformat == 1:
tmp_path = "%s/%s.reformatted.wav" % (
os.path.join(os.environ["TEMP"]),
os.path.basename(inp_path),
)
os.system(
"ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y"
% (inp_path, tmp_path)
)
inp_path = tmp_path
try:
if done == 0:
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0
)
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
try:
if done == 0:
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0
)
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
infos.append(
"%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
)
yield "\n".join(infos)
except:
infos.append(traceback.format_exc())
yield "\n".join(infos)
finally:
try:
if model_name == "onnx_dereverb_By_FoxJoy":
del pre_fun.pred.model
del pre_fun.pred.model_
else:
del pre_fun.model
del pre_fun
except:
traceback.print_exc()
if torch.cuda.is_available():
torch.cuda.empty_cache()
logger.info("Executed torch.cuda.empty_cache()")
yield "\n".join(infos)

@ -0,0 +1,302 @@
import traceback
import logging
logger = logging.getLogger(__name__)
import numpy as np
import soundfile as sf
import torch
from io import BytesIO
from infer.lib.audio import load_audio, wav2
from infer.lib.infer_pack.models import (
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
SynthesizerTrnMs768NSFsid,
SynthesizerTrnMs768NSFsid_nono,
)
from infer.modules.vc.pipeline import Pipeline
from infer.modules.vc.utils import *
class VC:
def __init__(self, config):
self.n_spk = None
self.tgt_sr = None
self.net_g = None
self.pipeline = None
self.cpt = None
self.version = None
self.if_f0 = None
self.version = None
self.hubert_model = None
self.config = config
def get_vc(self, sid, *to_return_protect):
logger.info("Get sid: " + sid)
to_return_protect0 = {
"visible": self.if_f0 != 0,
"value": to_return_protect[0]
if self.if_f0 != 0 and to_return_protect
else 0.5,
"__type__": "update",
}
to_return_protect1 = {
"visible": self.if_f0 != 0,
"value": to_return_protect[1]
if self.if_f0 != 0 and to_return_protect
else 0.33,
"__type__": "update",
}
if sid == "" or sid == []:
            if self.hubert_model is not None:  # polling may call this repeatedly, so check whether sid switched from a loaded model to none
logger.info("Clean model cache")
del (self.net_g, self.n_spk, self.hubert_model, self.tgt_sr) # ,cpt
self.hubert_model = (
self.net_g
) = self.n_spk = self.hubert_model = self.tgt_sr = None
if torch.cuda.is_available():
torch.cuda.empty_cache()
                ### without the reload/delete dance below, the memory is never fully released
self.if_f0 = self.cpt.get("f0", 1)
self.version = self.cpt.get("version", "v1")
if self.version == "v1":
if self.if_f0 == 1:
self.net_g = SynthesizerTrnMs256NSFsid(
*self.cpt["config"], is_half=self.config.is_half
)
else:
self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"])
elif self.version == "v2":
if self.if_f0 == 1:
self.net_g = SynthesizerTrnMs768NSFsid(
*self.cpt["config"], is_half=self.config.is_half
)
else:
self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"])
del self.net_g, self.cpt
if torch.cuda.is_available():
torch.cuda.empty_cache()
return (
{"visible": False, "__type__": "update"},
{
"visible": True,
"value": to_return_protect0,
"__type__": "update",
},
{
"visible": True,
"value": to_return_protect1,
"__type__": "update",
},
"",
"",
)
person = f'{os.getenv("weight_root")}/{sid}'
logger.info(f"Loading: {person}")
self.cpt = torch.load(person, map_location="cpu")
self.tgt_sr = self.cpt["config"][-1]
self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk
self.if_f0 = self.cpt.get("f0", 1)
self.version = self.cpt.get("version", "v1")
synthesizer_class = {
("v1", 1): SynthesizerTrnMs256NSFsid,
("v1", 0): SynthesizerTrnMs256NSFsid_nono,
("v2", 1): SynthesizerTrnMs768NSFsid,
("v2", 0): SynthesizerTrnMs768NSFsid_nono,
}
self.net_g = synthesizer_class.get(
(self.version, self.if_f0), SynthesizerTrnMs256NSFsid
)(*self.cpt["config"], is_half=self.config.is_half)
del self.net_g.enc_q
self.net_g.load_state_dict(self.cpt["weight"], strict=False)
self.net_g.eval().to(self.config.device)
if self.config.is_half:
self.net_g = self.net_g.half()
else:
self.net_g = self.net_g.float()
self.pipeline = Pipeline(self.tgt_sr, self.config)
n_spk = self.cpt["config"][-3]
index = {"value": get_index_path_from_model(sid), "__type__": "update"}
logger.info("Select index: " + index["value"])
return (
(
{"visible": True, "maximum": n_spk, "__type__": "update"},
to_return_protect0,
to_return_protect1,
index,
index,
)
if to_return_protect
else {"visible": True, "maximum": n_spk, "__type__": "update"}
)
def vc_single(
self,
sid,
input_audio_path,
f0_up_key,
f0_file,
f0_method,
file_index,
file_index2,
index_rate,
filter_radius,
resample_sr,
rms_mix_rate,
protect,
):
if input_audio_path is None:
return "You need to upload an audio", None
f0_up_key = int(f0_up_key)
try:
audio = load_audio(input_audio_path, 16000)
audio_max = np.abs(audio).max() / 0.95
if audio_max > 1:
audio /= audio_max
times = [0, 0, 0]
if self.hubert_model is None:
self.hubert_model = load_hubert(self.config)
if file_index:
file_index = (
file_index.strip(" ")
.strip('"')
.strip("\n")
.strip('"')
.strip(" ")
.replace("trained", "added")
)
elif file_index2:
file_index = file_index2
else:
file_index = "" # 防止小白写错,自动帮他替换掉
audio_opt = self.pipeline.pipeline(
self.hubert_model,
self.net_g,
sid,
audio,
input_audio_path,
times,
f0_up_key,
f0_method,
file_index,
index_rate,
self.if_f0,
filter_radius,
self.tgt_sr,
resample_sr,
rms_mix_rate,
self.version,
protect,
f0_file,
)
if self.tgt_sr != resample_sr >= 16000:
tgt_sr = resample_sr
else:
tgt_sr = self.tgt_sr
index_info = (
"Index:\n%s." % file_index
if os.path.exists(file_index)
else "Index not used."
)
return (
"Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs."
% (index_info, *times),
(tgt_sr, audio_opt),
)
except:
info = traceback.format_exc()
logger.warning(info)
return info, (None, None)
def vc_multi(
self,
sid,
dir_path,
opt_root,
paths,
f0_up_key,
f0_method,
file_index,
file_index2,
index_rate,
filter_radius,
resample_sr,
rms_mix_rate,
protect,
format1,
):
try:
dir_path = (
dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
            )  # strip spaces, quotes and newlines that users may paste around the path
opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
os.makedirs(opt_root, exist_ok=True)
try:
if dir_path != "":
paths = [
os.path.join(dir_path, name) for name in os.listdir(dir_path)
]
else:
paths = [path.name for path in paths]
except:
traceback.print_exc()
paths = [path.name for path in paths]
infos = []
for path in paths:
info, opt = self.vc_single(
sid,
path,
f0_up_key,
None,
f0_method,
file_index,
file_index2,
# file_big_npy,
index_rate,
filter_radius,
resample_sr,
rms_mix_rate,
protect,
)
if "Success" in info:
try:
tgt_sr, audio_opt = opt
if format1 in ["wav", "flac"]:
sf.write(
"%s/%s.%s"
% (opt_root, os.path.basename(path), format1),
audio_opt,
tgt_sr,
)
else:
path = "%s/%s.%s" % (
opt_root,
os.path.basename(path),
format1,
)
with BytesIO() as wavf:
sf.write(wavf, audio_opt, tgt_sr, format="wav")
wavf.seek(0, 0)
with open(path, "wb") as outf:
wav2(wavf, outf, format1)
except:
info += traceback.format_exc()
infos.append("%s->%s" % (os.path.basename(path), info))
yield "\n".join(infos)
yield "\n".join(infos)
except:
yield traceback.format_exc()
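The VC class above needs only a Config plus the weight_root / index_root environment variables consumed by get_vc and get_index_path_from_model. A minimal single-file conversion sketch, assuming this file is importable as infer.modules.vc.modules and that my_model.pth exists under weight_root; every numeric setting below is an illustrative default rather than a value from the commit.
import soundfile as sf

from configs.config import Config
from infer.modules.vc.modules import VC

vc = VC(Config())
vc.get_vc("my_model.pth")  # resolved against $weight_root
info, (tgt_sr, audio_opt) = vc.vc_single(
    sid=0,
    input_audio_path="input.wav",
    f0_up_key=0,  # transpose in semitones
    f0_file=None,
    f0_method="rmvpe",  # needs $rmvpe_root pointing at rmvpe.pt
    file_index="",
    file_index2="",
    index_rate=0.75,
    filter_radius=3,
    resample_sr=0,
    rms_mix_rate=0.25,
    protect=0.33,
)
print(info)
if audio_opt is not None:
    sf.write("converted.wav", audio_opt, tgt_sr)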

@ -0,0 +1,457 @@
import os
import sys
import traceback
import logging
logger = logging.getLogger(__name__)
from functools import lru_cache
from time import time as ttime
import faiss
import librosa
import numpy as np
import parselmouth
import pyworld
import torch
import torch.nn.functional as F
import torchcrepe
from scipy import signal
now_dir = os.getcwd()
sys.path.append(now_dir)
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
input_audio_path2wav = {}
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
audio = input_audio_path2wav[input_audio_path]
f0, t = pyworld.harvest(
audio,
fs=fs,
f0_ceil=f0max,
f0_floor=f0min,
frame_period=frame_period,
)
f0 = pyworld.stonemask(audio, f0, t, fs)
return f0
def change_rms(data1, sr1, data2, sr2, rate):  # data1 is the input audio, data2 is the output audio, rate is the weight given to data2's envelope
# print(data1.max(),data2.max())
rms1 = librosa.feature.rms(
y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
    )  # one RMS point every half second
rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
rms1 = torch.from_numpy(rms1)
rms1 = F.interpolate(
rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
).squeeze()
rms2 = torch.from_numpy(rms2)
rms2 = F.interpolate(
rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
).squeeze()
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
data2 *= (
torch.pow(rms1, torch.tensor(1 - rate))
* torch.pow(rms2, torch.tensor(rate - 1))
).numpy()
return data2
class Pipeline(object):
def __init__(self, tgt_sr, config):
self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
config.x_pad,
config.x_query,
config.x_center,
config.x_max,
config.is_half,
)
        self.sr = 16000  # hubert input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding added before and after each segment
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search range around each candidate cut point
        self.t_center = self.sr * self.x_center  # spacing between candidate cut points
        self.t_max = self.sr * self.x_max  # below this length, no cut-point search is needed
self.device = config.device
def get_f0(
self,
input_audio_path,
x,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0=None,
):
global input_audio_path2wav
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm":
f0 = (
parselmouth.Sound(x, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
input_audio_path2wav[input_audio_path] = x.astype(np.double)
f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
if filter_radius > 2:
f0 = signal.medfilt(f0, 3)
elif f0_method == "crepe":
model = "full"
# Pick a batch size that doesn't cause memory errors on your gpu
batch_size = 512
# Compute pitch using first gpu
audio = torch.tensor(np.copy(x))[None].float()
f0, pd = torchcrepe.predict(
audio,
self.sr,
self.window,
f0_min,
f0_max,
model,
batch_size=batch_size,
device=self.device,
return_periodicity=True,
)
pd = torchcrepe.filter.median(pd, 3)
f0 = torchcrepe.filter.mean(f0, 3)
f0[pd < 0.1] = 0
f0 = f0[0].cpu().numpy()
elif f0_method == "rmvpe":
if not hasattr(self, "model_rmvpe"):
from infer.lib.rmvpe import RMVPE
logger.info(
"Loading rmvpe model,%s" % "%s/rmvpe.pt" % os.environ["rmvpe_root"]
)
self.model_rmvpe = RMVPE(
"%s/rmvpe.pt" % os.environ["rmvpe_root"],
is_half=self.is_half,
device=self.device,
)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
if "privateuseone" in str(self.device): # clean ortruntime memory
del self.model_rmvpe.model
del self.model_rmvpe
logger.info("Cleaning ortruntime memory")
f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # f0 points per second
if inp_f0 is not None:
delta_t = np.round(
(inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
).astype("int16")
replace_f0 = np.interp(
list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
)
shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
:shape
]
# with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int32)
return f0_coarse, f0bak # 1-0
def vc(
self,
model,
net_g,
sid,
audio0,
pitch,
pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
): # ,file_index,file_big_npy
feats = torch.from_numpy(audio0)
if self.is_half:
feats = feats.half()
else:
feats = feats.float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
inputs = {
"source": feats.to(self.device),
"padding_mask": padding_mask,
"output_layer": 9 if version == "v1" else 12,
}
t0 = ttime()
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
if protect < 0.5 and pitch is not None and pitchf is not None:
feats0 = feats.clone()
if (
not isinstance(index, type(None))
and not isinstance(big_npy, type(None))
and index_rate != 0
):
npy = feats[0].cpu().numpy()
if self.is_half:
npy = npy.astype("float32")
# _, I = index.search(npy, 1)
# npy = big_npy[I.squeeze()]
score, ix = index.search(npy, k=8)
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
if self.is_half:
npy = npy.astype("float16")
feats = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ (1 - index_rate) * feats
)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
if protect < 0.5 and pitch is not None and pitchf is not None:
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
0, 2, 1
)
t1 = ttime()
p_len = audio0.shape[0] // self.window
if feats.shape[1] < p_len:
p_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, :p_len]
pitchf = pitchf[:, :p_len]
if protect < 0.5 and pitch is not None and pitchf is not None:
pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1
pitchff[pitchf < 1] = protect
pitchff = pitchff.unsqueeze(-1)
feats = feats * pitchff + feats0 * (1 - pitchff)
feats = feats.to(feats0.dtype)
p_len = torch.tensor([p_len], device=self.device).long()
with torch.no_grad():
hasp = pitch is not None and pitchf is not None
arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid)
audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy()
del hasp, arg
del feats, p_len, padding_mask
if torch.cuda.is_available():
torch.cuda.empty_cache()
t2 = ttime()
times[0] += t1 - t0
times[2] += t2 - t1
return audio1
def pipeline(
self,
model,
net_g,
sid,
audio,
input_audio_path,
times,
f0_up_key,
f0_method,
file_index,
index_rate,
if_f0,
filter_radius,
tgt_sr,
resample_sr,
rms_mix_rate,
version,
protect,
f0_file=None,
):
if (
file_index != ""
# and file_big_npy != ""
# and os.path.exists(file_big_npy) == True
and os.path.exists(file_index)
and index_rate != 0
):
try:
index = faiss.read_index(file_index)
# big_npy = np.load(file_big_npy)
big_npy = index.reconstruct_n(0, index.ntotal)
except:
traceback.print_exc()
index = big_npy = None
else:
index = big_npy = None
audio = signal.filtfilt(bh, ah, audio)
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
opt_ts = []
if audio_pad.shape[0] > self.t_max:
audio_sum = np.zeros_like(audio)
for i in range(self.window):
audio_sum += np.abs(audio_pad[i : i - self.window])
for t in range(self.t_center, audio.shape[0], self.t_center):
opt_ts.append(
t
- self.t_query
+ np.where(
audio_sum[t - self.t_query : t + self.t_query]
== audio_sum[t - self.t_query : t + self.t_query].min()
)[0][0]
)
s = 0
audio_opt = []
t = None
t1 = ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
p_len = audio_pad.shape[0] // self.window
inp_f0 = None
if hasattr(f0_file, "name"):
try:
with open(f0_file.name, "r") as f:
lines = f.read().strip("\n").split("\n")
inp_f0 = []
for line in lines:
inp_f0.append([float(i) for i in line.split(",")])
inp_f0 = np.array(inp_f0, dtype="float32")
except:
traceback.print_exc()
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
pitch, pitchf = None, None
if if_f0 == 1:
pitch, pitchf = self.get_f0(
input_audio_path,
audio_pad,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0,
)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
if "mps" not in str(self.device) or "xpu" not in str(self.device):
pitchf = pitchf.astype(np.float32)
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
t2 = ttime()
times[1] += t2 - t1
for t in opt_ts:
t = t // self.window * self.window
if if_f0 == 1:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
pitch[:, s // self.window : (t + self.t_pad2) // self.window],
pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
s = t
if if_f0 == 1:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
pitch[:, t // self.window :] if t is not None else pitch,
pitchf[:, t // self.window :] if t is not None else pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
audio_opt = np.concatenate(audio_opt)
if rms_mix_rate != 1:
audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
if tgt_sr != resample_sr >= 16000:
audio_opt = librosa.resample(
audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
)
audio_max = np.abs(audio_opt).max() / 0.99
max_int16 = 32768
if audio_max > 1:
max_int16 /= audio_max
audio_opt = (audio_opt * max_int16).astype(np.int16)
del pitch, pitchf, sid
if torch.cuda.is_available():
torch.cuda.empty_cache()
return audio_opt

@ -0,0 +1,33 @@
import os
from fairseq import checkpoint_utils
def get_index_path_from_model(sid):
return next(
(
f
for f in [
os.path.join(root, name)
for root, _, files in os.walk(os.getenv("index_root"), topdown=False)
for name in files
if name.endswith(".index") and "trained" not in name
]
if sid.split(".")[0] in f
),
"",
)
def load_hubert(config):
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
["assets/hubert/hubert_base.pt"],
suffix="",
)
hubert_model = models[0]
hubert_model = hubert_model.to(config.device)
if config.is_half:
hubert_model = hubert_model.half()
else:
hubert_model = hubert_model.float()
return hubert_model.eval()
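Both helpers above are small: get_index_path_from_model walks $index_root for a .index file (skipping "trained" ones) whose path contains the speaker weight's basename, and load_hubert loads assets/hubert/hubert_base.pt onto the configured device. A short sketch of how they are typically combined, assuming this file is importable as infer.modules.vc.utils and that index_root points at the training logs directory.
import os

from configs.config import Config
from infer.modules.vc.utils import get_index_path_from_model, load_hubert

os.environ.setdefault("index_root", "logs")  # assumed index location

config = Config()
hubert_model = load_hubert(config)  # fp16 or fp32 depending on config.is_half
index_path = get_index_path_from_model("my_model.pth")
print(index_path or "no matching .index file found")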