commit 2b4520a5f14ba1dbaae64777defb144276efe2f7 Author: 宇宙遨游 <2746069727@qq.com> Date: Thu Jan 18 17:41:57 2024 +0800 '2' diff --git a/app.py b/app.py new file mode 100644 index 0000000..9bbee91 --- /dev/null +++ b/app.py @@ -0,0 +1,63 @@ +import os +import sys + +import ffmpeg +import torch +import warnings + +#验证SDK token +from modelscope.hub.api import HubApi +api = HubApi() +api.login(os.getenv('token')) + +#模型下载 +from modelscope import snapshot_download +model_dir = snapshot_download('yuzhouaoyou/uvr') + +warnings.filterwarnings("ignore") +os.system('pip install librosa==0.9.1') +os.system('pip install numpy==1.23.5') +#os.system('pip install flask') + +sys.path.append(os.getcwd()) +from infer.modules.uvr5.mdxnet import MDXNetDereverb +from infer.modules.uvr5.vr import AudioPre + +def run_uvr(file): + out_path = 'opt' + func = AudioPre(agg=10,model_path=f'{model_dir}/HP5.pth',device='cuda:0' if torch.cuda.is_available() else 'cpu',is_half=False) + need_reformat = True + try: + info = ffmpeg.probe(file, cmd="ffprobe") + if info["streams"][0]["channels"] == 2 and info["streams"][0]["sample_rate"] == "44100": + need_reformat = False + except: + need_reformat = True + + if need_reformat: + tmp_path = "%s/%s.reformatted.wav" % ( + os.path.join(os.environ["TEMP"]), + os.path.basename(file), + ) + os.system( + "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" + % (file, tmp_path) + ) + file = tmp_path + func._path_audio_(file, out_path, out_path, 'mp3') + return '%s/instrument_%s_10.mp3' % (out_path,os.path.basename(file)), '%s/vocal_%s_10.mp3' % (out_path,os.path.basename(file)) + +import gradio as gr + +def f(file): + i,v = run_uvr(file) + return [i,v] + +demo = gr.Interface( + fn=f, + inputs=[gr.File()], + outputs=[gr.Files()], + title="UVR 人声分离", +) + +demo.launch() \ No newline at end of file diff --git a/infer/lib/audio.py b/infer/lib/audio.py new file mode 100644 index 0000000..fe16245 --- /dev/null +++ b/infer/lib/audio.py @@ -0,0 +1,73 @@ +import os +import traceback + +import librosa +import numpy as np +import av +from io import BytesIO + + +def wav2(i, o, format): + inp = av.open(i, "rb") + if format == "m4a": + format = "mp4" + out = av.open(o, "wb", format=format) + if format == "ogg": + format = "libvorbis" + if format == "mp4": + format = "aac" + + ostream = out.add_stream(format) + + for frame in inp.decode(audio=0): + for p in ostream.encode(frame): + out.mux(p) + + for p in ostream.encode(None): + out.mux(p) + + out.close() + inp.close() + + +def audio2(i, o, format, sr): + inp = av.open(i, "rb") + out = av.open(o, "wb", format=format) + if format == "ogg": + format = "libvorbis" + if format == "f32le": + format = "pcm_f32le" + + ostream = out.add_stream(format, channels=1) + ostream.sample_rate = sr + + for frame in inp.decode(audio=0): + for p in ostream.encode(frame): + out.mux(p) + + out.close() + inp.close() + + +def load_audio(file, sr): + file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + if os.path.exists(file) == False: + raise RuntimeError( + "You input a wrong audio path that does not exists, please fix it!" + ) + try: + with open(file, "rb") as f: + with BytesIO() as out: + audio2(f, out, "f32le", sr) + return np.frombuffer(out.getvalue(), np.float32).flatten() + + except AttributeError: + audio = file[1] / 32768.0 + if len(audio.shape) == 2: + audio = np.mean(audio, -1) + return librosa.resample(audio, orig_sr=file[0], target_sr=16000) + + except: + raise RuntimeError(traceback.format_exc()) diff --git a/infer/lib/infer_pack/attentions.py b/infer/lib/infer_pack/attentions.py new file mode 100644 index 0000000..b554253 --- /dev/null +++ b/infer/lib/infer_pack/attentions.py @@ -0,0 +1,459 @@ +import copy +import math +from typing import Optional + +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from infer.lib.infer_pack import commons, modules +from infer.lib.infer_pack.modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=10, + **kwargs + ): + super(Encoder, self).__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = int(n_layers) + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + zippep = zip( + self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 + ) + for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep: + y = attn_layers(x, x, attn_mask) + y = self.drop(y) + x = norm_layers_1(x + y) + + y = ffn_layers(x, x_mask) + y = self.drop(y) + x = norm_layers_2(x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super(Decoder, self).__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super(MultiHeadAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward( + self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None + ): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, _ = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s = key.size() + t_t = query.size(2) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length: int): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length: int = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + # commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + [0, 0, pad_length, pad_length, 0, 0], + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad( + x, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]) + [0, 1, 0, 0, 0, 0, 0, 0], + ) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, int(length) - 1]]) + [0, int(length) - 1, 0, 0, 0, 0], + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, int(length) - 1]]) + [0, int(length) - 1, 0, 0, 0, 0, 0, 0], + ) + x_flat = x.view([batch, heads, int(length**2) + int(length * (length - 1))]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad( + x_flat, + # commons.convert_pad_shape([[0, 0], [0, 0], [int(length), 0]]) + [length, 0, 0, 0, 0, 0], + ) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length: int): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation: str = None, + causal=False, + ): + super(FFN, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + self.is_activation = True if activation == "gelu" else False + # if causal: + # self.padding = self._causal_padding + # else: + # self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: + if self.causal: + padding = self._causal_padding(x * x_mask) + else: + padding = self._same_padding(x * x_mask) + return padding + + def forward(self, x: torch.Tensor, x_mask: torch.Tensor): + x = self.conv_1(self.padding(x, x_mask)) + if self.is_activation: + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + + x = self.conv_2(self.padding(x, x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l: int = self.kernel_size - 1 + pad_r: int = 0 + # padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad( + x, + # commons.convert_pad_shape(padding) + [pad_l, pad_r, 0, 0, 0, 0], + ) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l: int = (self.kernel_size - 1) // 2 + pad_r: int = self.kernel_size // 2 + # padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad( + x, + # commons.convert_pad_shape(padding) + [pad_l, pad_r, 0, 0, 0, 0], + ) + return x diff --git a/infer/lib/infer_pack/commons.py b/infer/lib/infer_pack/commons.py new file mode 100644 index 0000000..05ea342 --- /dev/null +++ b/infer/lib/infer_pack/commons.py @@ -0,0 +1,172 @@ +from typing import List, Optional +import math + +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +# def convert_pad_shape(pad_shape): +# l = pad_shape[::-1] +# pad_shape = [item for sublist in l for item in sublist] +# return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def slice_segments2(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +# def convert_pad_shape(pad_shape): +# l = pad_shape[::-1] +# pad_shape = [item for sublist in l for item in sublist] +# return pad_shape + + +def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]: + return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist() + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py new file mode 100644 index 0000000..e489634 --- /dev/null +++ b/infer/lib/infer_pack/models.py @@ -0,0 +1,1439 @@ +import math +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + +import numpy as np +import torch +from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from infer.lib.infer_pack import attentions, commons, modules +from infer.lib.infer_pack.commons import get_padding, init_weights + +has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available()) + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super(TextEncoder256, self).__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor + ): + if pitch is None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super(TextEncoder768, self).__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone: torch.Tensor, pitch: torch.Tensor, lengths: torch.Tensor): + if pitch is None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super(ResidualCouplingBlock, self).__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in self.flows[::-1]: + x, _ = flow.forward(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + def __prepare_scriptable__(self): + for i in range(self.n_flows): + for hook in self.flows[i * 2]._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flows[i * 2]) + + return self + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super(PosteriorEncoder, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None + ): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + + for l in self.resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(torch.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + if uv.device.type == "privateuseone": # for DirectML + uv = uv.float() + return uv + + def forward(self, f0: torch.Tensor, upp: int): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in range(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=float(upp), + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + # self.ddtype:int = -1 + + def forward(self, x: torch.Tensor, upp: int = 1): + # if self.ddtype ==-1: + # self.ddtype = self.l_linear.weight.dtype + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) + # if self.is_half: + # sine_wavs = sine_wavs.half() + # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) + # print(sine_wavs.dtype,self.ddtype) + # if sine_wavs.dtype != self.l_linear.weight.dtype: + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = math.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = math.prod(upsample_rates) + + self.lrelu_slope = modules.LRELU_SLOPE + + def forward(self, x, f0, g: Optional[torch.Tensor] = None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + # torch.jit.script() does not support direct indexing of torch modules + # That's why I wrote this + for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): + if i < self.num_upsamples: + x = F.leaky_relu(x, self.lrelu_slope) + x = ups(x) + x_source = noise_convs(har_source) + x = x + x_source + xs: Optional[torch.Tensor] = None + l = [i * self.num_kernels + j for j in range(self.num_kernels)] + for j, resblock in enumerate(self.resblocks): + if j in l: + if xs is None: + xs = resblock(x) + else: + xs += resblock(x) + # This assertion cannot be ignored! \ + # If ignored, it will cause torch.jit.script() compilation errors + assert isinstance(xs, torch.Tensor) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.resblocks: + for hook in self.resblocks._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super(SynthesizerTrnMs256NSFsid, self).__init__() + if isinstance(sr, str): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + logger.debug( + "gin_channels: " + + str(gin_channels) + + ", self.spk_embed_dim: " + + str(self.spk_embed_dim) + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + pitchf: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + ds: Optional[torch.Tensor] = None, + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + nsff0: torch.Tensor, + sid: torch.Tensor, + skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, + ): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if skip_head is not None and return_length is not None: + assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) + head = int(skip_head.item()) + length = int(return_length.item()) + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] + nsff0 = nsff0[:, head : head + length] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super(SynthesizerTrnMs768NSFsid, self).__init__() + if isinstance(sr, str): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + logger.debug( + "gin_channels: " + + str(gin_channels) + + ", self.spk_embed_dim: " + + str(self.spk_embed_dim) + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + nsff0: torch.Tensor, + sid: torch.Tensor, + skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, + ): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if skip_head is not None and return_length is not None: + assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) + head = int(skip_head.item()) + length = int(return_length.item()) + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] + nsff0 = nsff0[:, head : head + length] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs256NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super(SynthesizerTrnMs256NSFsid_nono, self).__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + logger.debug( + "gin_channels: " + + str(gin_channels) + + ", self.spk_embed_dim: " + + str(self.spk_embed_dim) + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + sid: torch.Tensor, + skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, + ): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if skip_head is not None and return_length is not None: + assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) + head = int(skip_head.item()) + length = int(return_length.item()) + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super(SynthesizerTrnMs768NSFsid_nono, self).__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + logger.debug( + "gin_channels: " + + str(gin_channels) + + ", self.spk_embed_dim: " + + str(self.spk_embed_dim) + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + sid: torch.Tensor, + skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, + ): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if skip_head is not None and return_length is not None: + assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) + head = int(skip_head.item()) + length = int(return_length.item()) + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + if has_xpu and x.dtype == torch.bfloat16: + x = F.pad(x.to(dtype=torch.float16), (0, n_pad), "reflect").to( + dtype=torch.bfloat16 + ) + else: + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py new file mode 100644 index 0000000..97308ef --- /dev/null +++ b/infer/lib/infer_pack/models_onnx.py @@ -0,0 +1,821 @@ +import math +import logging + +logger = logging.getLogger(__name__) + +import numpy as np +import torch +from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from infer.lib.infer_pack import attentions, commons, modules +from infer.lib.infer_pack.commons import get_padding, init_weights + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x, _ = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMsNSFsidM(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + version, + **kwargs, + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + if version == "v1": + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + else: + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + self.speaker_map = None + logger.debug( + f"gin_channels: {gin_channels}, self.spk_embed_dim: {self.spk_embed_dim}" + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def construct_spkmixmap(self, n_speaker): + self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) + for i in range(n_speaker): + self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) + self.speaker_map = self.speaker_map.unsqueeze(0) + + def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): + if self.speaker_map is not None: # [N, S] * [S, B, 1, H] + g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] + g = g * self.speaker_map # [N, S, B, 1, H] + g = torch.sum(g, dim=1) # [N, 1, B, 1, H] + g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] + else: + g = g.unsqueeze(0) + g = self.emb_g(g).transpose(1, 2) + + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/infer/lib/infer_pack/modules.py b/infer/lib/infer_pack/modules.py new file mode 100644 index 0000000..47157a9 --- /dev/null +++ b/infer/lib/infer_pack/modules.py @@ -0,0 +1,615 @@ +import copy +import math +from typing import Optional, Tuple + +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, weight_norm + +from infer.lib.infer_pack import commons +from infer.lib.infer_pack.commons import get_padding, init_weights +from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super(LayerNorm, self).__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super(ConvReluNorm, self).__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = float(p_dropout) + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(float(p_dropout))) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super(DDSConv, self).__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = float(p_dropout) + + self.drop = nn.Dropout(float(p_dropout)) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g: Optional[torch.Tensor] = None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = float(p_dropout) + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(float(p_dropout)) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward( + self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None + ): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i, (in_layer, res_skip_layer) in enumerate( + zip(self.in_layers, self.res_skip_layers) + ): + x_in = in_layer(x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = res_skip_layer(acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + def __prepare_scriptable__(self): + if self.gin_channels != 0: + for hook in self.cond_layer._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + self.lrelu_slope = LRELU_SLOPE + + def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, self.lrelu_slope) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, self.lrelu_slope) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + def __prepare_scriptable__(self): + for l in self.convs1: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.convs2: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + self.lrelu_slope = LRELU_SLOPE + + def forward(self, x, x_mask: Optional[torch.Tensor] = None): + for c in self.convs: + xt = F.leaky_relu(x, self.lrelu_slope) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + def __prepare_scriptable__(self): + for l in self.convs: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +class Log(nn.Module): + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + # torch.jit.script() Compiled functions \ + # can't take variable number of arguments or \ + # use keyword-only arguments with defaults + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x, torch.zeros([1], device=x.device) + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super(ElementwiseAffine, self).__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super(ResidualCouplingLayer, self).__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=float(p_dropout), + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x, torch.zeros([1]) + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self + + +class ConvFlow(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + n_layers, + num_bins=10, + tail_bound=5.0, + ): + super(ConvFlow, self).__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) + self.proj = nn.Conv1d( + filter_channels, self.half_channels * (num_bins * 3 - 1), 1 + ) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse=False, + ): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] + + unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( + self.filter_channels + ) + unnormalized_derivatives = h[..., 2 * self.num_bins :] + + x1, logabsdet = piecewise_rational_quadratic_transform( + x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails="linear", + tail_bound=self.tail_bound, + ) + + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1, 2]) + if not reverse: + return x, logdet + else: + return x diff --git a/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py new file mode 100644 index 0000000..55abcfd --- /dev/null +++ b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py @@ -0,0 +1,91 @@ +import numpy as np +import pyworld + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor + + +class DioF0Predictor(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.sampling_rate = sampling_rate + + def interpolate_f0(self, f0): + """ + 对F0进行插值处理 + """ + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def resize_f0(self, x, target_len): + source = np.array(x) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * target_len, len(source)) / target_len, + np.arange(0, len(source)), + source, + ) + res = np.nan_to_num(target) + return res + + def compute_f0(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.dio( + wav.astype(np.double), + fs=self.sampling_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] + + def compute_f0_uv(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.dio( + wav.astype(np.double), + fs=self.sampling_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py new file mode 100644 index 0000000..f56e49e --- /dev/null +++ b/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py @@ -0,0 +1,16 @@ +class F0Predictor(object): + def compute_f0(self, wav, p_len): + """ + input: wav:[signal_length] + p_len:int + output: f0:[signal_length//hop_length] + """ + pass + + def compute_f0_uv(self, wav, p_len): + """ + input: wav:[signal_length] + p_len:int + output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] + """ + pass diff --git a/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py new file mode 100644 index 0000000..40c28ce --- /dev/null +++ b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py @@ -0,0 +1,87 @@ +import numpy as np +import pyworld + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor + + +class HarvestF0Predictor(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.sampling_rate = sampling_rate + + def interpolate_f0(self, f0): + """ + 对F0进行插值处理 + """ + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def resize_f0(self, x, target_len): + source = np.array(x) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * target_len, len(source)) / target_len, + np.arange(0, len(source)), + source, + ) + res = np.nan_to_num(target) + return res + + def compute_f0(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.harvest( + wav.astype(np.double), + fs=self.sampling_rate, + f0_ceil=self.f0_max, + f0_floor=self.f0_min, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] + + def compute_f0_uv(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.harvest( + wav.astype(np.double), + fs=self.sampling_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py new file mode 100644 index 0000000..06f2b79 --- /dev/null +++ b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py @@ -0,0 +1,98 @@ +import numpy as np +import parselmouth + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor + + +class PMF0Predictor(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.sampling_rate = sampling_rate + + def interpolate_f0(self, f0): + """ + 对F0进行插值处理 + """ + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def compute_f0(self, wav, p_len=None): + x = wav + if p_len is None: + p_len = x.shape[0] // self.hop_length + else: + assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" + time_step = self.hop_length / self.sampling_rate * 1000 + f0 = ( + parselmouth.Sound(x, self.sampling_rate) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=self.f0_min, + pitch_ceiling=self.f0_max, + ) + .selected_array["frequency"] + ) + + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") + f0, uv = self.interpolate_f0(f0) + return f0 + + def compute_f0_uv(self, wav, p_len=None): + x = wav + if p_len is None: + p_len = x.shape[0] // self.hop_length + else: + assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" + time_step = self.hop_length / self.sampling_rate * 1000 + f0 = ( + parselmouth.Sound(x, self.sampling_rate) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=self.f0_min, + pitch_ceiling=self.f0_max, + ) + .selected_array["frequency"] + ) + + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") + f0, uv = self.interpolate_f0(f0) + return f0, uv diff --git a/infer/lib/infer_pack/modules/F0Predictor/__init__.py b/infer/lib/infer_pack/modules/F0Predictor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/infer/lib/infer_pack/onnx_inference.py b/infer/lib/infer_pack/onnx_inference.py new file mode 100644 index 0000000..6633659 --- /dev/null +++ b/infer/lib/infer_pack/onnx_inference.py @@ -0,0 +1,149 @@ +import librosa +import numpy as np +import onnxruntime +import soundfile + +import logging + +logger = logging.getLogger(__name__) + + +class ContentVec: + def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None): + logger.info("Load model(s) from {}".format(vec_path)) + if device == "cpu" or device is None: + providers = ["CPUExecutionProvider"] + elif device == "cuda": + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + elif device == "dml": + providers = ["DmlExecutionProvider"] + else: + raise RuntimeError("Unsportted Device") + self.model = onnxruntime.InferenceSession(vec_path, providers=providers) + + def __call__(self, wav): + return self.forward(wav) + + def forward(self, wav): + feats = wav + if feats.ndim == 2: # double channels + feats = feats.mean(-1) + assert feats.ndim == 1, feats.ndim + feats = np.expand_dims(np.expand_dims(feats, 0), 0) + onnx_input = {self.model.get_inputs()[0].name: feats} + logits = self.model.run(None, onnx_input)[0] + return logits.transpose(0, 2, 1) + + +def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): + if f0_predictor == "pm": + from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor + + f0_predictor_object = PMF0Predictor( + hop_length=hop_length, sampling_rate=sampling_rate + ) + elif f0_predictor == "harvest": + from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import ( + HarvestF0Predictor, + ) + + f0_predictor_object = HarvestF0Predictor( + hop_length=hop_length, sampling_rate=sampling_rate + ) + elif f0_predictor == "dio": + from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor + + f0_predictor_object = DioF0Predictor( + hop_length=hop_length, sampling_rate=sampling_rate + ) + else: + raise Exception("Unknown f0 predictor") + return f0_predictor_object + + +class OnnxRVC: + def __init__( + self, + model_path, + sr=40000, + hop_size=512, + vec_path="vec-768-layer-12", + device="cpu", + ): + vec_path = f"pretrained/{vec_path}.onnx" + self.vec_model = ContentVec(vec_path, device) + if device == "cpu" or device is None: + providers = ["CPUExecutionProvider"] + elif device == "cuda": + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + elif device == "dml": + providers = ["DmlExecutionProvider"] + else: + raise RuntimeError("Unsportted Device") + self.model = onnxruntime.InferenceSession(model_path, providers=providers) + self.sampling_rate = sr + self.hop_size = hop_size + + def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd): + onnx_input = { + self.model.get_inputs()[0].name: hubert, + self.model.get_inputs()[1].name: hubert_length, + self.model.get_inputs()[2].name: pitch, + self.model.get_inputs()[3].name: pitchf, + self.model.get_inputs()[4].name: ds, + self.model.get_inputs()[5].name: rnd, + } + return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16) + + def inference( + self, + raw_path, + sid, + f0_method="dio", + f0_up_key=0, + pad_time=0.5, + cr_threshold=0.02, + ): + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0_predictor = get_f0_predictor( + f0_method, + hop_length=self.hop_size, + sampling_rate=self.sampling_rate, + threshold=cr_threshold, + ) + wav, sr = librosa.load(raw_path, sr=self.sampling_rate) + org_length = len(wav) + if org_length / sr > 50.0: + raise RuntimeError("Reached Max Length") + + wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000) + wav16k = wav16k + + hubert = self.vec_model(wav16k) + hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) + hubert_length = hubert.shape[1] + + pitchf = f0_predictor.compute_f0(wav, hubert_length) + pitchf = pitchf * 2 ** (f0_up_key / 12) + pitch = pitchf.copy() + f0_mel = 1127 * np.log(1 + pitch / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + pitch = np.rint(f0_mel).astype(np.int64) + + pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32) + pitch = pitch.reshape(1, len(pitch)) + ds = np.array([sid]).astype(np.int64) + + rnd = np.random.randn(1, 192, hubert_length).astype(np.float32) + hubert_length = np.array([hubert_length]).astype(np.int64) + + out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze() + out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant") + return out_wav[0:org_length] diff --git a/infer/lib/infer_pack/transforms.py b/infer/lib/infer_pack/transforms.py new file mode 100644 index 0000000..6f30b71 --- /dev/null +++ b/infer/lib/infer_pack/transforms.py @@ -0,0 +1,207 @@ +import numpy as np +import torch +from torch.nn import functional as F + +DEFAULT_MIN_BIN_WIDTH = 1e-3 +DEFAULT_MIN_BIN_HEIGHT = 1e-3 +DEFAULT_MIN_DERIVATIVE = 1e-3 + + +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if tails is None: + spline_fn = rational_quadratic_spline + spline_kwargs = {} + else: + spline_fn = unconstrained_rational_quadratic_spline + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} + + outputs, logabsdet = spline_fn( + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs + ) + return outputs, logabsdet + + +def searchsorted(bin_locations, inputs, eps=1e-6): + bin_locations[..., -1] += eps + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 + + +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + logabsdet = torch.zeros_like(inputs) + + if tails == "linear": + unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + constant = np.log(np.exp(1 - min_derivative) - 1) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + logabsdet[outside_interval_mask] = 0 + else: + raise RuntimeError("{} tails are not implemented.".format(tails)) + + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + inverse=inverse, + left=-tail_bound, + right=tail_bound, + bottom=-tail_bound, + top=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + ) + + return outputs, logabsdet + + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if torch.min(inputs) < left or torch.max(inputs) > right: + raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError("Minimal bin width too large for the number of bins") + if min_bin_height * num_bins > 1.0: + raise ValueError("Minimal bin height too large for the number of bins") + + widths = F.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (right - left) * cumwidths + left + cumwidths[..., 0] = left + cumwidths[..., -1] = right + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + F.softplus(unnormalized_derivatives) + + heights = F.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (top - bottom) * cumheights + bottom + cumheights[..., 0] = bottom + cumheights[..., -1] = top + heights = cumheights[..., 1:] - cumheights[..., :-1] + + if inverse: + bin_idx = searchsorted(cumheights, inputs)[..., None] + else: + bin_idx = searchsorted(cumwidths, inputs)[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + if inverse: + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) + + discriminant = b.pow(2) - 4 * a * c + assert (discriminant >= 0).all() + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, -logabsdet + else: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, logabsdet diff --git a/infer/lib/jit/__init__.py b/infer/lib/jit/__init__.py new file mode 100644 index 0000000..d7f41dd --- /dev/null +++ b/infer/lib/jit/__init__.py @@ -0,0 +1,163 @@ +from io import BytesIO +import pickle +import time +import torch +from tqdm import tqdm +from collections import OrderedDict + + +def load_inputs(path, device, is_half=False): + parm = torch.load(path, map_location=torch.device("cpu")) + for key in parm.keys(): + parm[key] = parm[key].to(device) + if is_half and parm[key].dtype == torch.float32: + parm[key] = parm[key].half() + elif not is_half and parm[key].dtype == torch.float16: + parm[key] = parm[key].float() + return parm + + +def benchmark( + model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False +): + parm = load_inputs(inputs_path, device, is_half) + total_ts = 0.0 + bar = tqdm(range(epoch)) + for i in bar: + start_time = time.perf_counter() + o = model(**parm) + total_ts += time.perf_counter() - start_time + print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}") + + +def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False): + benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half) + + +def to_jit_model( + model_path, + model_type: str, + mode: str = "trace", + inputs_path: str = None, + device=torch.device("cpu"), + is_half=False, +): + model = None + if model_type.lower() == "synthesizer": + from .get_synthesizer import get_synthesizer + + model, _ = get_synthesizer(model_path, device) + model.forward = model.infer + elif model_type.lower() == "rmvpe": + from .get_rmvpe import get_rmvpe + + model = get_rmvpe(model_path, device) + elif model_type.lower() == "hubert": + from .get_hubert import get_hubert_model + + model = get_hubert_model(model_path, device) + model.forward = model.infer + else: + raise ValueError(f"No model type named {model_type}") + model = model.eval() + model = model.half() if is_half else model.float() + if mode == "trace": + assert not inputs_path + inputs = load_inputs(inputs_path, device, is_half) + model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) + elif mode == "script": + model_jit = torch.jit.script(model) + model_jit.to(device) + model_jit = model_jit.half() if is_half else model_jit.float() + # model = model.half() if is_half else model.float() + return (model, model_jit) + + +def export( + model: torch.nn.Module, + mode: str = "trace", + inputs: dict = None, + device=torch.device("cpu"), + is_half: bool = False, +) -> dict: + model = model.half() if is_half else model.float() + model.eval() + if mode == "trace": + assert inputs is not None + model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) + elif mode == "script": + model_jit = torch.jit.script(model) + model_jit.to(device) + model_jit = model_jit.half() if is_half else model_jit.float() + buffer = BytesIO() + # model_jit=model_jit.cpu() + torch.jit.save(model_jit, buffer) + del model_jit + cpt = OrderedDict() + cpt["model"] = buffer.getvalue() + cpt["is_half"] = is_half + return cpt + + +def load(path: str): + with open(path, "rb") as f: + return pickle.load(f) + + +def save(ckpt: dict, save_path: str): + with open(save_path, "wb") as f: + pickle.dump(ckpt, f) + + +def rmvpe_jit_export( + model_path: str, + mode: str = "script", + inputs_path: str = None, + save_path: str = None, + device=torch.device("cpu"), + is_half=False, +): + if not save_path: + save_path = model_path.rstrip(".pth") + save_path += ".half.jit" if is_half else ".jit" + if "cuda" in str(device) and ":" not in str(device): + device = torch.device("cuda:0") + from .get_rmvpe import get_rmvpe + + model = get_rmvpe(model_path, device) + inputs = None + if mode == "trace": + inputs = load_inputs(inputs_path, device, is_half) + ckpt = export(model, mode, inputs, device, is_half) + ckpt["device"] = str(device) + save(ckpt, save_path) + return ckpt + + +def synthesizer_jit_export( + model_path: str, + mode: str = "script", + inputs_path: str = None, + save_path: str = None, + device=torch.device("cpu"), + is_half=False, +): + if not save_path: + save_path = model_path.rstrip(".pth") + save_path += ".half.jit" if is_half else ".jit" + if "cuda" in str(device) and ":" not in str(device): + device = torch.device("cuda:0") + from .get_synthesizer import get_synthesizer + + model, cpt = get_synthesizer(model_path, device) + assert isinstance(cpt, dict) + model.forward = model.infer + inputs = None + if mode == "trace": + inputs = load_inputs(inputs_path, device, is_half) + ckpt = export(model, mode, inputs, device, is_half) + cpt.pop("weight") + cpt["model"] = ckpt["model"] + cpt["device"] = device + save(cpt, save_path) + return cpt diff --git a/infer/lib/jit/get_hubert.py b/infer/lib/jit/get_hubert.py new file mode 100644 index 0000000..aec7132 --- /dev/null +++ b/infer/lib/jit/get_hubert.py @@ -0,0 +1,342 @@ +import math +import random +from typing import Optional, Tuple +from fairseq.checkpoint_utils import load_model_ensemble_and_task +import numpy as np +import torch +import torch.nn.functional as F + +# from fairseq.data.data_utils import compute_mask_indices +from fairseq.utils import index_put + + +# @torch.jit.script +def pad_to_multiple(x, multiple, dim=-1, value=0): + # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41 + if x is None: + return None, 0 + tsz = x.size(dim) + m = tsz / multiple + remainder = math.ceil(m) * multiple - tsz + if int(tsz % multiple) == 0: + return x, 0 + pad_offset = (0,) * (-1 - dim) * 2 + + return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder + + +def extract_features( + self, + x, + padding_mask=None, + tgt_layer=None, + min_layer=0, +): + if padding_mask is not None: + x = index_put(x, padding_mask, 0) + + x_conv = self.pos_conv(x.transpose(1, 2)) + x_conv = x_conv.transpose(1, 2) + x = x + x_conv + + if not self.layer_norm_first: + x = self.layer_norm(x) + + # pad to the sequence length dimension + x, pad_length = pad_to_multiple(x, self.required_seq_len_multiple, dim=-2, value=0) + if pad_length > 0 and padding_mask is None: + padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool) + padding_mask[:, -pad_length:] = True + else: + padding_mask, _ = pad_to_multiple( + padding_mask, self.required_seq_len_multiple, dim=-1, value=True + ) + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + layer_results = [] + r = None + for i, layer in enumerate(self.layers): + dropout_probability = np.random.random() if self.layerdrop > 0 else 1 + if not self.training or (dropout_probability > self.layerdrop): + x, (z, lr) = layer( + x, self_attn_padding_mask=padding_mask, need_weights=False + ) + if i >= min_layer: + layer_results.append((x, z, lr)) + if i == tgt_layer: + r = x + break + + if r is not None: + x = r + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + # undo paddding + if pad_length > 0: + x = x[:, :-pad_length] + + def undo_pad(a, b, c): + return ( + a[:-pad_length], + b[:-pad_length] if b is not None else b, + c[:-pad_length], + ) + + layer_results = [undo_pad(*u) for u in layer_results] + + return x, layer_results + + +def compute_mask_indices( + shape: Tuple[int, int], + padding_mask: Optional[torch.Tensor], + mask_prob: float, + mask_length: int, + mask_type: str = "static", + mask_other: float = 0.0, + min_masks: int = 0, + no_overlap: bool = False, + min_space: int = 0, + require_same_masks: bool = True, + mask_dropout: float = 0.0, +) -> torch.Tensor: + """ + Computes random mask spans for a given shape + + Args: + shape: the the shape for which to compute masks. + should be of size 2 where first element is batch size and 2nd is timesteps + padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements + mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by + number of timesteps divided by length of mask span to mask approximately this percentage of all elements. + however due to overlaps, the actual number will be smaller (unless no_overlap is True) + mask_type: how to compute mask lengths + static = fixed size + uniform = sample from uniform distribution [mask_other, mask_length*2] + normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element + poisson = sample from possion distribution with lambda = mask length + min_masks: minimum number of masked spans + no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping + min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans + require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample + mask_dropout: randomly dropout this percentage of masks in each example + """ + + bsz, all_sz = shape + mask = torch.full((bsz, all_sz), False) + + all_num_mask = int( + # add a random number for probabilistic rounding + mask_prob * all_sz / float(mask_length) + + torch.rand([1]).item() + ) + + all_num_mask = max(min_masks, all_num_mask) + + mask_idcs = [] + for i in range(bsz): + if padding_mask is not None: + sz = all_sz - padding_mask[i].long().sum().item() + num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand()) + num_mask = max(min_masks, num_mask) + else: + sz = all_sz + num_mask = all_num_mask + + if mask_type == "static": + lengths = torch.full([num_mask], mask_length) + elif mask_type == "uniform": + lengths = torch.randint(mask_other, mask_length * 2 + 1, size=[num_mask]) + elif mask_type == "normal": + lengths = torch.normal(mask_length, mask_other, size=[num_mask]) + lengths = [max(1, int(round(x))) for x in lengths] + else: + raise Exception("unknown mask selection " + mask_type) + + if sum(lengths) == 0: + lengths[0] = min(mask_length, sz - 1) + + if no_overlap: + mask_idc = [] + + def arrange(s, e, length, keep_length): + span_start = torch.randint(low=s, high=e - length, size=[1]).item() + mask_idc.extend(span_start + i for i in range(length)) + + new_parts = [] + if span_start - s - min_space >= keep_length: + new_parts.append((s, span_start - min_space + 1)) + if e - span_start - length - min_space > keep_length: + new_parts.append((span_start + length + min_space, e)) + return new_parts + + parts = [(0, sz)] + min_length = min(lengths) + for length in sorted(lengths, reverse=True): + t = [e - s if e - s >= length + min_space else 0 for s, e in parts] + lens = torch.asarray(t, dtype=torch.int) + l_sum = torch.sum(lens) + if l_sum == 0: + break + probs = lens / torch.sum(lens) + c = torch.multinomial(probs.float(), len(parts)).item() + s, e = parts.pop(c) + parts.extend(arrange(s, e, length, min_length)) + mask_idc = torch.asarray(mask_idc) + else: + min_len = min(lengths) + if sz - min_len <= num_mask: + min_len = sz - num_mask - 1 + mask_idc = torch.asarray( + random.sample([i for i in range(sz - min_len)], num_mask) + ) + mask_idc = torch.asarray( + [ + mask_idc[j] + offset + for j in range(len(mask_idc)) + for offset in range(lengths[j]) + ] + ) + + mask_idcs.append(torch.unique(mask_idc[mask_idc < sz])) + + min_len = min([len(m) for m in mask_idcs]) + for i, mask_idc in enumerate(mask_idcs): + if isinstance(mask_idc, torch.Tensor): + mask_idc = torch.asarray(mask_idc, dtype=torch.float) + if len(mask_idc) > min_len and require_same_masks: + mask_idc = torch.asarray( + random.sample([i for i in range(mask_idc)], min_len) + ) + if mask_dropout > 0: + num_holes = int(round(len(mask_idc) * mask_dropout)) + mask_idc = torch.asarray( + random.sample([i for i in range(mask_idc)], len(mask_idc) - num_holes) + ) + + mask[i, mask_idc.int()] = True + + return mask + + +def apply_mask(self, x, padding_mask, target_list): + B, T, C = x.shape + torch.zeros_like(x) + if self.mask_prob > 0: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=2, + no_overlap=self.no_mask_overlap, + min_space=self.mask_min_space, + ) + mask_indices = mask_indices.to(x.device) + x[mask_indices] = self.mask_emb + else: + mask_indices = None + + if self.mask_channel_prob > 0: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + mask_channel_indices.to(x.device).unsqueeze(1).expand(-1, T, -1) + ) + x[mask_channel_indices] = 0 + + return x, mask_indices + + +def get_hubert_model( + model_path="assets/hubert/hubert_base.pt", device=torch.device("cpu") +): + models, _, _ = load_model_ensemble_and_task( + [model_path], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(device) + + def _apply_mask(x, padding_mask, target_list): + return apply_mask(hubert_model, x, padding_mask, target_list) + + hubert_model.apply_mask = _apply_mask + + def _extract_features( + x, + padding_mask=None, + tgt_layer=None, + min_layer=0, + ): + return extract_features( + hubert_model.encoder, + x, + padding_mask=padding_mask, + tgt_layer=tgt_layer, + min_layer=min_layer, + ) + + hubert_model.encoder.extract_features = _extract_features + + hubert_model._forward = hubert_model.forward + + def hubert_extract_features( + self, + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = False, + ret_conv: bool = False, + output_layer: Optional[int] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + res = self._forward( + source, + padding_mask=padding_mask, + mask=mask, + features_only=True, + output_layer=output_layer, + ) + feature = res["features"] if ret_conv else res["x"] + return feature, res["padding_mask"] + + def _hubert_extract_features( + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = False, + ret_conv: bool = False, + output_layer: Optional[int] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + return hubert_extract_features( + hubert_model, source, padding_mask, mask, ret_conv, output_layer + ) + + hubert_model.extract_features = _hubert_extract_features + + def infer(source, padding_mask, output_layer: torch.Tensor): + output_layer = output_layer.item() + logits = hubert_model.extract_features( + source=source, padding_mask=padding_mask, output_layer=output_layer + ) + feats = hubert_model.final_proj(logits[0]) if output_layer == 9 else logits[0] + return feats + + hubert_model.infer = infer + # hubert_model.forward=infer + # hubert_model.forward + + return hubert_model diff --git a/infer/lib/jit/get_rmvpe.py b/infer/lib/jit/get_rmvpe.py new file mode 100644 index 0000000..e71c39f --- /dev/null +++ b/infer/lib/jit/get_rmvpe.py @@ -0,0 +1,12 @@ +import torch + + +def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")): + from infer.lib.rmvpe import E2E + + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location=device) + model.load_state_dict(ckpt) + model.eval() + model = model.to(device) + return model diff --git a/infer/lib/jit/get_synthesizer.py b/infer/lib/jit/get_synthesizer.py new file mode 100644 index 0000000..b8db4fa --- /dev/null +++ b/infer/lib/jit/get_synthesizer.py @@ -0,0 +1,38 @@ +import torch + + +def get_synthesizer(pth_path, device=torch.device("cpu")): + from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, + ) + + cpt = torch.load(pth_path, map_location=torch.device("cpu")) + # tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + if_f0 = cpt.get("f0", 1) + version = cpt.get("version", "v1") + if version == "v1": + if if_f0 == 1: + net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) + else: + net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + elif version == "v2": + if if_f0 == 1: + net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False) + else: + net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) + del net_g.enc_q + # net_g.forward = net_g.infer + # ckpt = {} + # ckpt["config"] = cpt["config"] + # ckpt["f0"] = if_f0 + # ckpt["version"] = version + # ckpt["info"] = cpt.get("info", "0epoch") + net_g.load_state_dict(cpt["weight"], strict=False) + net_g = net_g.float() + net_g.eval().to(device) + net_g.remove_weight_norm() + return net_g, cpt diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py new file mode 100644 index 0000000..5e55f97 --- /dev/null +++ b/infer/lib/rmvpe.py @@ -0,0 +1,670 @@ +from io import BytesIO +import os +from typing import List, Optional, Tuple +import numpy as np +import torch + +from infer.lib import jit + +try: + # Fix "Torch not compiled with CUDA enabled" + import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + + if torch.xpu.is_available(): + from infer.modules.ipex import ipex_init + + ipex_init() +except Exception: # pylint: disable=broad-exception-caught + pass +import torch.nn as nn +import torch.nn.functional as F +from librosa.util import normalize, pad_center, tiny +from scipy.signal import get_window + +import logging + +logger = logging.getLogger(__name__) + + +class STFT(torch.nn.Module): + def __init__( + self, filter_length=1024, hop_length=512, win_length=None, window="hann" + ): + """ + This module implements an STFT using 1D convolution and 1D transpose convolutions. + This is a bit tricky so there are some cases that probably won't work as working + out the same sizes before and after in all overlap add setups is tough. Right now, + this code should work with hop lengths that are half the filter length (50% overlap + between frames). + + Keyword Arguments: + filter_length {int} -- Length of filters used (default: {1024}) + hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512}) + win_length {[type]} -- Length of the window function applied to each frame (if not specified, it + equals the filter length). (default: {None}) + window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris) + (default: {'hann'}) + """ + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length if win_length else filter_length + self.window = window + self.forward_transform = None + self.pad_amount = int(self.filter_length / 2) + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack( + [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] + ) + forward_basis = torch.FloatTensor(fourier_basis) + inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis)) + + assert filter_length >= self.win_length + # get window and zero center pad it to filter_length + fft_window = get_window(window, self.win_length, fftbins=True) + fft_window = pad_center(fft_window, size=filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis = (inverse_basis.T * fft_window).T + + self.register_buffer("forward_basis", forward_basis.float()) + self.register_buffer("inverse_basis", inverse_basis.float()) + self.register_buffer("fft_window", fft_window.float()) + + def transform(self, input_data, return_phase=False): + """Take input data (audio) to STFT domain. + + Arguments: + input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) + + Returns: + magnitude {tensor} -- Magnitude of STFT with shape (num_batch, + num_frequencies, num_frames) + phase {tensor} -- Phase of STFT with shape (num_batch, + num_frequencies, num_frames) + """ + input_data = F.pad( + input_data, + (self.pad_amount, self.pad_amount), + mode="reflect", + ) + forward_transform = input_data.unfold( + 1, self.filter_length, self.hop_length + ).permute(0, 2, 1) + forward_transform = torch.matmul(self.forward_basis, forward_transform) + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + magnitude = torch.sqrt(real_part**2 + imag_part**2) + if return_phase: + phase = torch.atan2(imag_part.data, real_part.data) + return magnitude, phase + else: + return magnitude + + def inverse(self, magnitude, phase): + """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced + by the ```transform``` function. + + Arguments: + magnitude {tensor} -- Magnitude of STFT with shape (num_batch, + num_frequencies, num_frames) + phase {tensor} -- Phase of STFT with shape (num_batch, + num_frequencies, num_frames) + + Returns: + inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of + shape (num_batch, num_samples) + """ + cat = torch.cat( + [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 + ) + fold = torch.nn.Fold( + output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length), + kernel_size=(1, self.filter_length), + stride=(1, self.hop_length), + ) + inverse_transform = torch.matmul(self.inverse_basis, cat) + inverse_transform = fold(inverse_transform)[ + :, 0, 0, self.pad_amount : -self.pad_amount + ] + window_square_sum = ( + self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0) + ) + window_square_sum = fold(window_square_sum)[ + :, 0, 0, self.pad_amount : -self.pad_amount + ] + inverse_transform /= window_square_sum + return inverse_transform + + def forward(self, input_data): + """Take input data (audio) to STFT domain and then back to audio. + + Arguments: + input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) + + Returns: + reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of + shape (num_batch, num_samples) + """ + self.magnitude, self.phase = self.transform(input_data, return_phase=True) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction + + +from time import time as ttime + + +class BiGRU(nn.Module): + def __init__(self, input_features, hidden_features, num_layers): + super(BiGRU, self).__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x): + return self.gru(x)[0] + + +class ConvBlockRes(nn.Module): + def __init__(self, in_channels, out_channels, momentum=0.01): + super(ConvBlockRes, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + # self.shortcut:Optional[nn.Module] = None + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) + + def forward(self, x: torch.Tensor): + if not hasattr(self, "shortcut"): + return self.conv(x) + x + else: + return self.conv(x) + self.shortcut(x) + + +class Encoder(nn.Module): + def __init__( + self, + in_channels, + in_size, + n_encoders, + kernel_size, + n_blocks, + out_channels=16, + momentum=0.01, + ): + super(Encoder, self).__init__() + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = [] + for i in range(self.n_encoders): + self.layers.append( + ResEncoderBlock( + in_channels, out_channels, kernel_size, n_blocks, momentum=momentum + ) + ) + self.latent_channels.append([out_channels, in_size]) + in_channels = out_channels + out_channels *= 2 + in_size //= 2 + self.out_size = in_size + self.out_channel = out_channels + + def forward(self, x: torch.Tensor): + concat_tensors: List[torch.Tensor] = [] + x = self.bn(x) + for i, layer in enumerate(self.layers): + t, x = layer(x) + concat_tensors.append(t) + return x, concat_tensors + + +class ResEncoderBlock(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 + ): + super(ResEncoderBlock, self).__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i, conv in enumerate(self.conv): + x = conv(x) + if self.kernel_size is not None: + return x, self.pool(x) + else: + return x + + +class Intermediate(nn.Module): # + def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): + super(Intermediate, self).__init__() + self.n_inters = n_inters + self.layers = nn.ModuleList() + self.layers.append( + ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) + ) + for i in range(self.n_inters - 1): + self.layers.append( + ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) + ) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = layer(x) + return x + + +class ResDecoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): + super(ResDecoderBlock, self).__init__() + out_padding = (0, 1) if stride == (1, 2) else (1, 1) + self.n_blocks = n_blocks + self.conv1 = nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=stride, + padding=(1, 1), + output_padding=out_padding, + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + x = torch.cat((x, concat_tensor), dim=1) + for i, conv2 in enumerate(self.conv2): + x = conv2(x) + return x + + +class Decoder(nn.Module): + def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): + super(Decoder, self).__init__() + self.layers = nn.ModuleList() + self.n_decoders = n_decoders + for i in range(self.n_decoders): + out_channels = in_channels // 2 + self.layers.append( + ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) + ) + in_channels = out_channels + + def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]): + for i, layer in enumerate(self.layers): + x = layer(x, concat_tensors[-1 - i]) + return x + + +class DeepUnet(nn.Module): + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +class E2E(nn.Module): + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, 360), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * nn.N_MELS, nn.N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + # print(mel.shape) + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + # print(x.shape) + return x + + +from librosa.filters import mel + + +class MelSpectrogram(torch.nn.Module): + def __init__( + self, + is_half, + n_mel_channels, + sampling_rate, + win_length, + hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sampling_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sampling_rate = sampling_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + self.is_half = is_half + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + audio.device + ) + if "privateuseone" in str(audio.device): + if not hasattr(self, "stft"): + self.stft = STFT( + filter_length=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window="hann", + ).to(audio.device) + magnitude = self.stft.transform(audio) + else: + fft = torch.stft( + audio, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ) + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new + mel_output = torch.matmul(self.mel_basis, magnitude) + if self.is_half == True: + mel_output = mel_output.half() + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + return log_mel_spec + + +class RMVPE: + def __init__(self, model_path: str, is_half, device=None, use_jit=False): + self.resample_kernel = {} + self.resample_kernel = {} + self.is_half = is_half + if device is None: + device = "cuda:0" if torch.cuda.is_available() else "cpu" + self.device = device + self.mel_extractor = MelSpectrogram( + is_half, 128, 16000, 1024, 160, None, 30, 8000 + ).to(device) + if "privateuseone" in str(device): + import onnxruntime as ort + + ort_session = ort.InferenceSession( + "%s/rmvpe.onnx" % os.environ["rmvpe_root"], + providers=["DmlExecutionProvider"], + ) + self.model = ort_session + else: + if str(self.device) == "cuda": + self.device = torch.device("cuda:0") + + def get_jit_model(): + jit_model_path = model_path.rstrip(".pth") + jit_model_path += ".half.jit" if is_half else ".jit" + reload = False + if os.path.exists(jit_model_path): + ckpt = jit.load(jit_model_path) + model_device = ckpt["device"] + if model_device != str(self.device): + reload = True + else: + reload = True + + if reload: + ckpt = jit.rmvpe_jit_export( + model_path=model_path, + mode="script", + inputs_path=None, + save_path=jit_model_path, + device=device, + is_half=is_half, + ) + model = torch.jit.load(BytesIO(ckpt["model"]), map_location=device) + return model + + def get_default_model(): + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location="cpu") + model.load_state_dict(ckpt) + model.eval() + if is_half: + model = model.half() + else: + model = model.float() + return model + + if use_jit: + if is_half and "cpu" in str(self.device): + logger.warning( + "Use default rmvpe model. \ + Jit is not supported on the CPU for half floating point" + ) + self.model = get_default_model() + else: + self.model = get_jit_model() + else: + self.model = get_default_model() + + self.model = self.model.to(device) + cents_mapping = 20 * np.arange(360) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 + + def mel2hidden(self, mel): + with torch.no_grad(): + n_frames = mel.shape[-1] + n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames + if n_pad > 0: + mel = F.pad(mel, (0, n_pad), mode="constant") + if "privateuseone" in str(self.device): + onnx_input_name = self.model.get_inputs()[0].name + onnx_outputs_names = self.model.get_outputs()[0].name + hidden = self.model.run( + [onnx_outputs_names], + input_feed={onnx_input_name: mel.cpu().numpy()}, + )[0] + else: + mel = mel.half() if self.is_half else mel.float() + hidden = self.model(mel) + return hidden[:, :n_frames] + + def decode(self, hidden, thred=0.03): + cents_pred = self.to_local_average_cents(hidden, thred=thred) + f0 = 10 * (2 ** (cents_pred / 1200)) + f0[f0 == 10] = 0 + # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) + return f0 + + def infer_from_audio(self, audio, thred=0.03): + # torch.cuda.synchronize() + # t0 = ttime() + if not torch.is_tensor(audio): + audio = torch.from_numpy(audio) + mel = self.mel_extractor( + audio.float().to(self.device).unsqueeze(0), center=True + ) + # print(123123123,mel.device.type) + # torch.cuda.synchronize() + # t1 = ttime() + hidden = self.mel2hidden(mel) + # torch.cuda.synchronize() + # t2 = ttime() + # print(234234,hidden.device.type) + if "privateuseone" not in str(self.device): + hidden = hidden.squeeze(0).cpu().numpy() + else: + hidden = hidden[0] + if self.is_half == True: + hidden = hidden.astype("float32") + + f0 = self.decode(hidden, thred=thred) + # torch.cuda.synchronize() + # t3 = ttime() + # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) + return f0 + + def to_local_average_cents(self, salience, thred=0.05): + # t0 = ttime() + center = np.argmax(salience, axis=1) # 帧长#index + salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 + # t1 = ttime() + center += 4 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + for idx in range(salience.shape[0]): + todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) + todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) + # t2 = ttime() + todo_salience = np.array(todo_salience) # 帧长,9 + todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 + product_sum = np.sum(todo_salience * todo_cents_mapping, 1) + weight_sum = np.sum(todo_salience, 1) # 帧长 + devided = product_sum / weight_sum # 帧长 + # t3 = ttime() + maxx = np.max(salience, axis=1) # 帧长 + devided[maxx <= thred] = 0 + # t4 = ttime() + # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + return devided + + +if __name__ == "__main__": + import librosa + import soundfile as sf + + audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav") + if len(audio.shape) > 1: + audio = librosa.to_mono(audio.transpose(1, 0)) + audio_bak = audio.copy() + if sampling_rate != 16000: + audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) + model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt" + thred = 0.03 # 0.01 + device = "cuda" if torch.cuda.is_available() else "cpu" + rmvpe = RMVPE(model_path, is_half=False, device=device) + t0 = ttime() + f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + t1 = ttime() + logger.info("%s %.2f", f0.shape, t1 - t0) diff --git a/infer/lib/slicer2.py b/infer/lib/slicer2.py new file mode 100644 index 0000000..5b29ee2 --- /dev/null +++ b/infer/lib/slicer2.py @@ -0,0 +1,260 @@ +import numpy as np + + +# This function is obtained from librosa. +def get_rms( + y, + frame_length=2048, + hop_length=512, + pad_mode="constant", +): + padding = (int(frame_length // 2), int(frame_length // 2)) + y = np.pad(y, padding, mode=pad_mode) + + axis = -1 + # put our new within-frame axis at the end for now + out_strides = y.strides + tuple([y.strides[axis]]) + # Reduce the shape on the framing axis + x_shape_trimmed = list(y.shape) + x_shape_trimmed[axis] -= frame_length - 1 + out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) + xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) + if axis < 0: + target_axis = axis - 1 + else: + target_axis = axis + 1 + xw = np.moveaxis(xw, -1, target_axis) + # Downsample along the target axis + slices = [slice(None)] * xw.ndim + slices[axis] = slice(0, None, hop_length) + x = xw[tuple(slices)] + + # Calculate power + power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) + + return np.sqrt(power) + + +class Slicer: + def __init__( + self, + sr: int, + threshold: float = -40.0, + min_length: int = 5000, + min_interval: int = 300, + hop_size: int = 20, + max_sil_kept: int = 5000, + ): + if not min_length >= min_interval >= hop_size: + raise ValueError( + "The following condition must be satisfied: min_length >= min_interval >= hop_size" + ) + if not max_sil_kept >= hop_size: + raise ValueError( + "The following condition must be satisfied: max_sil_kept >= hop_size" + ) + min_interval = sr * min_interval / 1000 + self.threshold = 10 ** (threshold / 20.0) + self.hop_size = round(sr * hop_size / 1000) + self.win_size = min(round(min_interval), 4 * self.hop_size) + self.min_length = round(sr * min_length / 1000 / self.hop_size) + self.min_interval = round(min_interval / self.hop_size) + self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) + + def _apply_slice(self, waveform, begin, end): + if len(waveform.shape) > 1: + return waveform[ + :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) + ] + else: + return waveform[ + begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) + ] + + # @timeit + def slice(self, waveform): + if len(waveform.shape) > 1: + samples = waveform.mean(axis=0) + else: + samples = waveform + if samples.shape[0] <= self.min_length: + return [waveform] + rms_list = get_rms( + y=samples, frame_length=self.win_size, hop_length=self.hop_size + ).squeeze(0) + sil_tags = [] + silence_start = None + clip_start = 0 + for i, rms in enumerate(rms_list): + # Keep looping while frame is silent. + if rms < self.threshold: + # Record start of silent frames. + if silence_start is None: + silence_start = i + continue + # Keep looping while frame is not silent and silence start has not been recorded. + if silence_start is None: + continue + # Clear recorded silence start if interval is not enough or clip is too short + is_leading_silence = silence_start == 0 and i > self.max_sil_kept + need_slice_middle = ( + i - silence_start >= self.min_interval + and i - clip_start >= self.min_length + ) + if not is_leading_silence and not need_slice_middle: + silence_start = None + continue + # Need slicing. Record the range of silent frames to be removed. + if i - silence_start <= self.max_sil_kept: + pos = rms_list[silence_start : i + 1].argmin() + silence_start + if silence_start == 0: + sil_tags.append((0, pos)) + else: + sil_tags.append((pos, pos)) + clip_start = pos + elif i - silence_start <= self.max_sil_kept * 2: + pos = rms_list[ + i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 + ].argmin() + pos += i - self.max_sil_kept + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + clip_start = pos_r + else: + sil_tags.append((min(pos_l, pos), max(pos_r, pos))) + clip_start = max(pos_r, pos) + else: + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + else: + sil_tags.append((pos_l, pos_r)) + clip_start = pos_r + silence_start = None + # Deal with trailing silence. + total_frames = rms_list.shape[0] + if ( + silence_start is not None + and total_frames - silence_start >= self.min_interval + ): + silence_end = min(total_frames, silence_start + self.max_sil_kept) + pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start + sil_tags.append((pos, total_frames + 1)) + # Apply and return slices. + if len(sil_tags) == 0: + return [waveform] + else: + chunks = [] + if sil_tags[0][0] > 0: + chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) + for i in range(len(sil_tags) - 1): + chunks.append( + self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]) + ) + if sil_tags[-1][1] < total_frames: + chunks.append( + self._apply_slice(waveform, sil_tags[-1][1], total_frames) + ) + return chunks + + +def main(): + import os.path + from argparse import ArgumentParser + + import librosa + import soundfile + + parser = ArgumentParser() + parser.add_argument("audio", type=str, help="The audio to be sliced") + parser.add_argument( + "--out", type=str, help="Output directory of the sliced audio clips" + ) + parser.add_argument( + "--db_thresh", + type=float, + required=False, + default=-40, + help="The dB threshold for silence detection", + ) + parser.add_argument( + "--min_length", + type=int, + required=False, + default=5000, + help="The minimum milliseconds required for each sliced audio clip", + ) + parser.add_argument( + "--min_interval", + type=int, + required=False, + default=300, + help="The minimum milliseconds for a silence part to be sliced", + ) + parser.add_argument( + "--hop_size", + type=int, + required=False, + default=10, + help="Frame length in milliseconds", + ) + parser.add_argument( + "--max_sil_kept", + type=int, + required=False, + default=500, + help="The maximum silence length kept around the sliced clip, presented in milliseconds", + ) + args = parser.parse_args() + out = args.out + if out is None: + out = os.path.dirname(os.path.abspath(args.audio)) + audio, sr = librosa.load(args.audio, sr=None, mono=False) + slicer = Slicer( + sr=sr, + threshold=args.db_thresh, + min_length=args.min_length, + min_interval=args.min_interval, + hop_size=args.hop_size, + max_sil_kept=args.max_sil_kept, + ) + chunks = slicer.slice(audio) + if not os.path.exists(out): + os.makedirs(out) + for i, chunk in enumerate(chunks): + if len(chunk.shape) > 1: + chunk = chunk.T + soundfile.write( + os.path.join( + out, + f"%s_%d.wav" + % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + ), + chunk, + sr, + ) + + +if __name__ == "__main__": + main() diff --git a/infer/lib/train/data_utils.py b/infer/lib/train/data_utils.py new file mode 100644 index 0000000..21399a3 --- /dev/null +++ b/infer/lib/train/data_utils.py @@ -0,0 +1,517 @@ +import os +import traceback +import logging + +logger = logging.getLogger(__name__) + +import numpy as np +import torch +import torch.utils.data + +from infer.lib.train.mel_processing import spectrogram_torch +from infer.lib.train.utils import load_filepaths_and_text, load_wav_to_torch + + +class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. + """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) + lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + # separate filename and text + file = audiopath_and_text[0] + phone = audiopath_and_text[1] + pitch = audiopath_and_text[2] + pitchf = audiopath_and_text[3] + dv = audiopath_and_text[4] + + phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + # print(123,phone.shape,pitch.shape,spec.shape) + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + # amor + len_wav = len_min * self.hop_length + + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + + phone = phone[:len_min, :] + pitch = pitch[:len_min] + pitchf = pitchf[:len_min] + + return (spec, wav, phone, pitch, pitchf, dv) + + def get_labels(self, phone, pitch, pitchf): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + pitch = np.load(pitch) + pitchf = np.load(pitchf) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + # print(234,phone.shape,pitch.shape) + phone = phone[:n_num, :] + pitch = pitch[:n_num] + pitchf = pitchf[:n_num] + phone = torch.FloatTensor(phone) + pitch = torch.LongTensor(pitch) + pitchf = torch.FloatTensor(pitchf) + return phone, pitch, pitchf + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + audio_norm = audio + # audio_norm = audio / self.max_wav_value + # audio_norm = audio / np.abs(audio).max() + + audio_norm = audio_norm.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename) + except: + logger.warning("%s %s", spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + return self.get_audio_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextAudioCollateMultiNSFsid: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) # (spec, wav, phone, pitch) + pitch_padded = torch.LongTensor(len(batch), max_phone_len) + pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) + phone_padded.zero_() + pitch_padded.zero_() + pitchf_padded.zero_() + # dv = torch.FloatTensor(len(batch), 256)#gin=256 + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + pitch = row[3] + pitch_padded[i, : pitch.size(0)] = pitch + pitchf = row[4] + pitchf_padded[i, : pitchf.size(0)] = pitchf + + # dv[i] = row[5] + sid[i] = row[5] + + return ( + phone_padded, + phone_lengths, + pitch_padded, + pitchf_padded, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + # dv + sid, + ) + + +class TextAudioLoader(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. + """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, dv]) + lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + # separate filename and text + file = audiopath_and_text[0] + phone = audiopath_and_text[1] + dv = audiopath_and_text[2] + + phone = self.get_labels(phone) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + len_wav = len_min * self.hop_length + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + phone = phone[:len_min, :] + return (spec, wav, phone, dv) + + def get_labels(self, phone): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + phone = phone[:n_num, :] + phone = torch.FloatTensor(phone) + return phone + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + audio_norm = audio + # audio_norm = audio / self.max_wav_value + # audio_norm = audio / np.abs(audio).max() + + audio_norm = audio_norm.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename) + except: + logger.warning("%s %s", spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + return self.get_audio_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextAudioCollate: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) + phone_padded.zero_() + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + sid[i] = row[3] + + return ( + phone_padded, + phone_lengths, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + sid, + ) + + +class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): + """ + Maintain similar input lengths in a batch. + Length groups are specified by boundaries. + Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. + + It removes samples which are not included in the boundaries. + Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. + """ + + def __init__( + self, + dataset, + batch_size, + boundaries, + num_replicas=None, + rank=None, + shuffle=True, + ): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + self.lengths = dataset.lengths + self.batch_size = batch_size + self.boundaries = boundaries + + self.buckets, self.num_samples_per_bucket = self._create_buckets() + self.total_size = sum(self.num_samples_per_bucket) + self.num_samples = self.total_size // self.num_replicas + + def _create_buckets(self): + buckets = [[] for _ in range(len(self.boundaries) - 1)] + for i in range(len(self.lengths)): + length = self.lengths[i] + idx_bucket = self._bisect(length) + if idx_bucket != -1: + buckets[idx_bucket].append(i) + + for i in range(len(buckets) - 1, -1, -1): # + if len(buckets[i]) == 0: + buckets.pop(i) + self.boundaries.pop(i + 1) + + num_samples_per_bucket = [] + for i in range(len(buckets)): + len_bucket = len(buckets[i]) + total_batch_size = self.num_replicas * self.batch_size + rem = ( + total_batch_size - (len_bucket % total_batch_size) + ) % total_batch_size + num_samples_per_bucket.append(len_bucket + rem) + return buckets, num_samples_per_bucket + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + if self.shuffle: + for bucket in self.buckets: + indices.append(torch.randperm(len(bucket), generator=g).tolist()) + else: + for bucket in self.buckets: + indices.append(list(range(len(bucket)))) + + batches = [] + for i in range(len(self.buckets)): + bucket = self.buckets[i] + len_bucket = len(bucket) + ids_bucket = indices[i] + num_samples_bucket = self.num_samples_per_bucket[i] + + # add extra samples to make it evenly divisible + rem = num_samples_bucket - len_bucket + ids_bucket = ( + ids_bucket + + ids_bucket * (rem // len_bucket) + + ids_bucket[: (rem % len_bucket)] + ) + + # subsample + ids_bucket = ids_bucket[self.rank :: self.num_replicas] + + # batching + for j in range(len(ids_bucket) // self.batch_size): + batch = [ + bucket[idx] + for idx in ids_bucket[ + j * self.batch_size : (j + 1) * self.batch_size + ] + ] + batches.append(batch) + + if self.shuffle: + batch_ids = torch.randperm(len(batches), generator=g).tolist() + batches = [batches[i] for i in batch_ids] + self.batches = batches + + assert len(self.batches) * self.batch_size == self.num_samples + return iter(self.batches) + + def _bisect(self, x, lo=0, hi=None): + if hi is None: + hi = len(self.boundaries) - 1 + + if hi > lo: + mid = (hi + lo) // 2 + if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: + return mid + elif x <= self.boundaries[mid]: + return self._bisect(x, lo, mid) + else: + return self._bisect(x, mid + 1, hi) + else: + return -1 + + def __len__(self): + return self.num_samples // self.batch_size diff --git a/infer/lib/train/losses.py b/infer/lib/train/losses.py new file mode 100644 index 0000000..b1b263e --- /dev/null +++ b/infer/lib/train/losses.py @@ -0,0 +1,58 @@ +import torch + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + dr = dr.float() + dg = dg.float() + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg**2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + +def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): + """ + z_p, logs_q: [b, h, t_t] + m_p, logs_p: [b, h, t_t] + """ + z_p = z_p.float() + logs_q = logs_q.float() + m_p = m_p.float() + logs_p = logs_p.float() + z_mask = z_mask.float() + + kl = logs_p - logs_q - 0.5 + kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) + kl = torch.sum(kl * z_mask) + l = kl / torch.sum(z_mask) + return l diff --git a/infer/lib/train/mel_processing.py b/infer/lib/train/mel_processing.py new file mode 100644 index 0000000..89744af --- /dev/null +++ b/infer/lib/train/mel_processing.py @@ -0,0 +1,127 @@ +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn +import logging + +logger = logging.getLogger(__name__) + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + return dynamic_range_compression_torch(magnitudes) + + +def spectral_de_normalize_torch(magnitudes): + return dynamic_range_decompression_torch(magnitudes) + + +# Reusable banks +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + """Convert waveform into Linear-frequency Linear-amplitude spectrogram. + + Args: + y :: (B, T) - Audio waveforms + n_fft + sampling_rate + hop_size + win_size + center + Returns: + :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram + """ + + # Window - Cache if needed + global hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + + # Padding + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + + # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) + spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + # MelBasis - Cache if needed + global mel_basis + dtype_device = str(spec.dtype) + "_" + str(spec.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + ) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=spec.dtype, device=spec.device + ) + + # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) + melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) + melspec = spectral_normalize_torch(melspec) + return melspec + + +def mel_spectrogram_torch( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + """Convert waveform into Mel-frequency Log-amplitude spectrogram. + + Args: + y :: (B, T) - Waveforms + Returns: + melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram + """ + # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) + spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) + + # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) + melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) + + return melspec diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py new file mode 100644 index 0000000..2529ccf --- /dev/null +++ b/infer/lib/train/process_ckpt.py @@ -0,0 +1,261 @@ +import os +import sys +import traceback +from collections import OrderedDict + +import torch + +from i18n.i18n import I18nAuto + +i18n = I18nAuto() + + +def savee(ckpt, sr, if_f0, name, epoch, version, hps): + try: + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + opt["config"] = [ + hps.data.filter_length // 2 + 1, + 32, + hps.model.inter_channels, + hps.model.hidden_channels, + hps.model.filter_channels, + hps.model.n_heads, + hps.model.n_layers, + hps.model.kernel_size, + hps.model.p_dropout, + hps.model.resblock, + hps.model.resblock_kernel_sizes, + hps.model.resblock_dilation_sizes, + hps.model.upsample_rates, + hps.model.upsample_initial_channel, + hps.model.upsample_kernel_sizes, + hps.model.spk_embed_dim, + hps.model.gin_channels, + hps.data.sampling_rate, + ] + opt["info"] = "%sepoch" % epoch + opt["sr"] = sr + opt["f0"] = if_f0 + opt["version"] = version + torch.save(opt, "assets/weights/%s.pth" % name) + return "Success." + except: + return traceback.format_exc() + + +def show_info(path): + try: + a = torch.load(path, map_location="cpu") + return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % ( + a.get("info", "None"), + a.get("sr", "None"), + a.get("f0", "None"), + a.get("version", "None"), + ) + except: + return traceback.format_exc() + + +def extract_small_model(path, name, sr, if_f0, info, version): + try: + ckpt = torch.load(path, map_location="cpu") + if "model" in ckpt: + ckpt = ckpt["model"] + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + if sr == "40k": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 10, 2, 2], + 512, + [16, 16, 4, 4], + 109, + 256, + 40000, + ] + elif sr == "48k": + if version == "v1": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 6, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 48000, + ] + else: + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [12, 10, 2, 2], + 512, + [24, 20, 4, 4], + 109, + 256, + 48000, + ] + elif sr == "32k": + if version == "v1": + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 4, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 32000, + ] + else: + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 8, 2, 2], + 512, + [20, 16, 4, 4], + 109, + 256, + 32000, + ] + if info == "": + info = "Extracted model." + opt["info"] = info + opt["version"] = version + opt["sr"] = sr + opt["f0"] = int(if_f0) + torch.save(opt, "assets/weights/%s.pth" % name) + return "Success." + except: + return traceback.format_exc() + + +def change_info(path, info, name): + try: + ckpt = torch.load(path, map_location="cpu") + ckpt["info"] = info + if name == "": + name = os.path.basename(path) + torch.save(ckpt, "assets/weights/%s" % name) + return "Success." + except: + return traceback.format_exc() + + +def merge(path1, path2, alpha1, sr, f0, info, name, version): + try: + + def extract(ckpt): + a = ckpt["model"] + opt = OrderedDict() + opt["weight"] = {} + for key in a.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = a[key] + return opt + + ckpt1 = torch.load(path1, map_location="cpu") + ckpt2 = torch.load(path2, map_location="cpu") + cfg = ckpt1["config"] + if "model" in ckpt1: + ckpt1 = extract(ckpt1) + else: + ckpt1 = ckpt1["weight"] + if "model" in ckpt2: + ckpt2 = extract(ckpt2) + else: + ckpt2 = ckpt2["weight"] + if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): + return "Fail to merge the models. The model architectures are not the same." + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt1.keys(): + # try: + if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: + min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) + opt["weight"][key] = ( + alpha1 * (ckpt1[key][:min_shape0].float()) + + (1 - alpha1) * (ckpt2[key][:min_shape0].float()) + ).half() + else: + opt["weight"][key] = ( + alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float()) + ).half() + # except: + # pdb.set_trace() + opt["config"] = cfg + """ + if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000] + elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000] + elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] + """ + opt["sr"] = sr + opt["f0"] = 1 if f0 == i18n("是") else 0 + opt["version"] = version + opt["info"] = info + torch.save(opt, "assets/weights/%s.pth" % name) + return "Success." + except: + return traceback.format_exc() diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py new file mode 100644 index 0000000..60a1a26 --- /dev/null +++ b/infer/lib/train/utils.py @@ -0,0 +1,478 @@ +import argparse +import glob +import json +import logging +import os +import subprocess +import sys +import shutil + +import numpy as np +import torch +from scipy.io.wavfile import read + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + + ################## + def go(model, bkey): + saved_state_dict = checkpoint_dict[bkey] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + logger.warning( + "shape-%s-mismatch. need: %s, get: %s", + k, + state_dict[k].shape, + saved_state_dict[k].shape, + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint", k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + return model + + go(combd, "combd") + model = go(sbd, "sbd") + ############# + logger.info("Loaded model weights") + + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +# def load_checkpoint(checkpoint_path, model, optimizer=None): +# assert os.path.isfile(checkpoint_path) +# checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') +# iteration = checkpoint_dict['iteration'] +# learning_rate = checkpoint_dict['learning_rate'] +# if optimizer is not None: +# optimizer.load_state_dict(checkpoint_dict['optimizer']) +# # print(1111) +# saved_state_dict = checkpoint_dict['model'] +# # print(1111) +# +# if hasattr(model, 'module'): +# state_dict = model.module.state_dict() +# else: +# state_dict = model.state_dict() +# new_state_dict= {} +# for k, v in state_dict.items(): +# try: +# new_state_dict[k] = saved_state_dict[k] +# except: +# logger.info("%s is not in the checkpoint" % k) +# new_state_dict[k] = v +# if hasattr(model, 'module'): +# model.module.load_state_dict(new_state_dict) +# else: +# model.load_state_dict(new_state_dict) +# logger.info("Loaded checkpoint '{}' (epoch {})" .format( +# checkpoint_path, iteration)) +# return model, optimizer, learning_rate, iteration +def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + logger.warning( + "shape-%s-mismatch|need-%s|get-%s", + k, + state_dict[k].shape, + saved_state_dict[k].shape, + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint", k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + logger.info("Loaded model weights") + + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(combd, "module"): + state_dict_combd = combd.module.state_dict() + else: + state_dict_combd = combd.state_dict() + if hasattr(sbd, "module"): + state_dict_sbd = sbd.module.state_dict() + else: + state_dict_sbd = sbd.state_dict() + torch.save( + { + "combd": state_dict_combd, + "sbd": state_dict_sbd, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + logger.debug(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + ) + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + """ + todo: + 结尾七人组: + 保存频率、总epoch done + bs done + pretrainG、pretrainD done + 卡号:os.en["CUDA_VISIBLE_DEVICES"] done + if_latest done + 模型:if_f0 done + 采样率:自动选择config done + 是否缓存数据集进GPU:if_cache_data_in_gpu done + + -m: + 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done + -c不要了 + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "-se", + "--save_every_epoch", + type=int, + required=True, + help="checkpoint save frequency (epoch)", + ) + parser.add_argument( + "-te", "--total_epoch", type=int, required=True, help="total_epoch" + ) + parser.add_argument( + "-pg", "--pretrainG", type=str, default="", help="Pretrained Generator path" + ) + parser.add_argument( + "-pd", "--pretrainD", type=str, default="", help="Pretrained Discriminator path" + ) + parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -") + parser.add_argument( + "-bs", "--batch_size", type=int, required=True, help="batch size" + ) + parser.add_argument( + "-e", "--experiment_dir", type=str, required=True, help="experiment dir" + ) # -m + parser.add_argument( + "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k" + ) + parser.add_argument( + "-sw", + "--save_every_weights", + type=str, + default="0", + help="save the extracted model in weights directory when saving checkpoints", + ) + parser.add_argument( + "-v", "--version", type=str, required=True, help="model version" + ) + parser.add_argument( + "-f0", + "--if_f0", + type=int, + required=True, + help="use f0 as one of the inputs of the model, 1 or 0", + ) + parser.add_argument( + "-l", + "--if_latest", + type=int, + required=True, + help="if only save the latest G/D pth file, 1 or 0", + ) + parser.add_argument( + "-c", + "--if_cache_data_in_gpu", + type=int, + required=True, + help="if caching the dataset in GPU memory, 1 or 0", + ) + + args = parser.parse_args() + name = args.experiment_dir + experiment_dir = os.path.join("./logs", args.experiment_dir) + + config_save_path = os.path.join(experiment_dir, "config.json") + with open(config_save_path, "r") as f: + config = json.load(f) + + hparams = HParams(**config) + hparams.model_dir = hparams.experiment_dir = experiment_dir + hparams.save_every_epoch = args.save_every_epoch + hparams.name = name + hparams.total_epoch = args.total_epoch + hparams.pretrainG = args.pretrainG + hparams.pretrainD = args.pretrainD + hparams.version = args.version + hparams.gpus = args.gpus + hparams.train.batch_size = args.batch_size + hparams.sample_rate = args.sample_rate + hparams.if_f0 = args.if_f0 + hparams.if_latest = args.if_latest + hparams.save_every_weights = args.save_every_weights + hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu + hparams.data.training_files = "%s/filelist.txt" % experiment_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warning( + "{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + ) + ) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warning( + "git hash values are different. {}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8] + ) + ) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/infer/lib/uvr5_pack/__pycache__/utils.cpython-310.pyc b/infer/lib/uvr5_pack/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000..52ed028 Binary files /dev/null and b/infer/lib/uvr5_pack/__pycache__/utils.cpython-310.pyc differ diff --git a/infer/lib/uvr5_pack/lib_v5/__pycache__/layers_123821KB.cpython-310.pyc b/infer/lib/uvr5_pack/lib_v5/__pycache__/layers_123821KB.cpython-310.pyc new file mode 100644 index 0000000..baeb54c Binary files /dev/null and b/infer/lib/uvr5_pack/lib_v5/__pycache__/layers_123821KB.cpython-310.pyc differ diff --git a/infer/lib/uvr5_pack/lib_v5/__pycache__/layers_new.cpython-310.pyc b/infer/lib/uvr5_pack/lib_v5/__pycache__/layers_new.cpython-310.pyc new file mode 100644 index 0000000..5d7c122 Binary files /dev/null and b/infer/lib/uvr5_pack/lib_v5/__pycache__/layers_new.cpython-310.pyc differ diff --git a/infer/lib/uvr5_pack/lib_v5/__pycache__/model_param_init.cpython-310.pyc b/infer/lib/uvr5_pack/lib_v5/__pycache__/model_param_init.cpython-310.pyc new file mode 100644 index 0000000..c77d4cf Binary files /dev/null and b/infer/lib/uvr5_pack/lib_v5/__pycache__/model_param_init.cpython-310.pyc differ diff --git a/infer/lib/uvr5_pack/lib_v5/__pycache__/nets_61968KB.cpython-310.pyc b/infer/lib/uvr5_pack/lib_v5/__pycache__/nets_61968KB.cpython-310.pyc new file mode 100644 index 0000000..5cc7c5b Binary files /dev/null and b/infer/lib/uvr5_pack/lib_v5/__pycache__/nets_61968KB.cpython-310.pyc differ diff --git a/infer/lib/uvr5_pack/lib_v5/__pycache__/nets_new.cpython-310.pyc b/infer/lib/uvr5_pack/lib_v5/__pycache__/nets_new.cpython-310.pyc new file mode 100644 index 0000000..c82caac Binary files /dev/null and b/infer/lib/uvr5_pack/lib_v5/__pycache__/nets_new.cpython-310.pyc differ diff --git a/infer/lib/uvr5_pack/lib_v5/__pycache__/spec_utils.cpython-310.pyc b/infer/lib/uvr5_pack/lib_v5/__pycache__/spec_utils.cpython-310.pyc new file mode 100644 index 0000000..ceed546 Binary files /dev/null and b/infer/lib/uvr5_pack/lib_v5/__pycache__/spec_utils.cpython-310.pyc differ diff --git a/infer/lib/uvr5_pack/lib_v5/dataset.py b/infer/lib/uvr5_pack/lib_v5/dataset.py new file mode 100644 index 0000000..cfd01a1 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/dataset.py @@ -0,0 +1,183 @@ +import os +import random + +import numpy as np +import torch +import torch.utils.data +from tqdm import tqdm + +from . import spec_utils + + +class VocalRemoverValidationSet(torch.utils.data.Dataset): + def __init__(self, patch_list): + self.patch_list = patch_list + + def __len__(self): + return len(self.patch_list) + + def __getitem__(self, idx): + path = self.patch_list[idx] + data = np.load(path) + + X, y = data["X"], data["y"] + + X_mag = np.abs(X) + y_mag = np.abs(y) + + return X_mag, y_mag + + +def make_pair(mix_dir, inst_dir): + input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] + + X_list = sorted( + [ + os.path.join(mix_dir, fname) + for fname in os.listdir(mix_dir) + if os.path.splitext(fname)[1] in input_exts + ] + ) + y_list = sorted( + [ + os.path.join(inst_dir, fname) + for fname in os.listdir(inst_dir) + if os.path.splitext(fname)[1] in input_exts + ] + ) + + filelist = list(zip(X_list, y_list)) + + return filelist + + +def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): + if split_mode == "random": + filelist = make_pair( + os.path.join(dataset_dir, "mixtures"), + os.path.join(dataset_dir, "instruments"), + ) + + random.shuffle(filelist) + + if len(val_filelist) == 0: + val_size = int(len(filelist) * val_rate) + train_filelist = filelist[:-val_size] + val_filelist = filelist[-val_size:] + else: + train_filelist = [ + pair for pair in filelist if list(pair) not in val_filelist + ] + elif split_mode == "subdirs": + if len(val_filelist) != 0: + raise ValueError( + "The `val_filelist` option is not available in `subdirs` mode" + ) + + train_filelist = make_pair( + os.path.join(dataset_dir, "training/mixtures"), + os.path.join(dataset_dir, "training/instruments"), + ) + + val_filelist = make_pair( + os.path.join(dataset_dir, "validation/mixtures"), + os.path.join(dataset_dir, "validation/instruments"), + ) + + return train_filelist, val_filelist + + +def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): + perm = np.random.permutation(len(X)) + for i, idx in enumerate(tqdm(perm)): + if np.random.uniform() < reduction_rate: + y[idx] = spec_utils.reduce_vocal_aggressively( + X[idx], y[idx], reduction_mask + ) + + if np.random.uniform() < 0.5: + # swap channel + X[idx] = X[idx, ::-1] + y[idx] = y[idx, ::-1] + if np.random.uniform() < 0.02: + # mono + X[idx] = X[idx].mean(axis=0, keepdims=True) + y[idx] = y[idx].mean(axis=0, keepdims=True) + if np.random.uniform() < 0.02: + # inst + X[idx] = y[idx] + + if np.random.uniform() < mixup_rate and i < len(perm) - 1: + lam = np.random.beta(mixup_alpha, mixup_alpha) + X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]] + y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]] + + return X, y + + +def make_padding(width, cropsize, offset): + left = offset + roi_size = cropsize - left * 2 + if roi_size == 0: + roi_size = cropsize + right = roi_size - (width % roi_size) + left + + return left, right, roi_size + + +def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset): + len_dataset = patches * len(filelist) + + X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) + y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) + + for i, (X_path, y_path) in enumerate(tqdm(filelist)): + X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) + coef = np.max([np.abs(X).max(), np.abs(y).max()]) + X, y = X / coef, y / coef + + l, r, roi_size = make_padding(X.shape[2], cropsize, offset) + X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") + y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") + + starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches) + ends = starts + cropsize + for j in range(patches): + idx = i * patches + j + X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]] + y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]] + + return X_dataset, y_dataset + + +def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): + patch_list = [] + patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( + cropsize, sr, hop_length, n_fft, offset + ) + os.makedirs(patch_dir, exist_ok=True) + + for i, (X_path, y_path) in enumerate(tqdm(filelist)): + basename = os.path.splitext(os.path.basename(X_path))[0] + + X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) + coef = np.max([np.abs(X).max(), np.abs(y).max()]) + X, y = X / coef, y / coef + + l, r, roi_size = make_padding(X.shape[2], cropsize, offset) + X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") + y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") + + len_dataset = int(np.ceil(X.shape[2] / roi_size)) + for j in range(len_dataset): + outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j)) + start = j * roi_size + if not os.path.exists(outpath): + np.savez( + outpath, + X=X_pad[:, :, start : start + cropsize], + y=y_pad[:, :, start : start + cropsize], + ) + patch_list.append(outpath) + + return VocalRemoverValidationSet(patch_list) diff --git a/infer/lib/uvr5_pack/lib_v5/layers.py b/infer/lib/uvr5_pack/lib_v5/layers.py new file mode 100644 index 0000000..4fc1b5c --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers.py @@ -0,0 +1,118 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py new file mode 100644 index 0000000..4fc1b5c --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py @@ -0,0 +1,118 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py new file mode 100644 index 0000000..4fc1b5c --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py @@ -0,0 +1,118 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py new file mode 100644 index 0000000..9b127bc --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py @@ -0,0 +1,126 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv6 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv7 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + feat6 = self.conv6(x) + feat7 = self.conv7(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py new file mode 100644 index 0000000..9b127bc --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py @@ -0,0 +1,126 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv6 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv7 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + feat6 = self.conv6(x) + feat7 = self.conv7(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py new file mode 100644 index 0000000..9b127bc --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py @@ -0,0 +1,126 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv6 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv7 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + feat6 = self.conv6(x) + feat7 = self.conv7(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_new.py b/infer/lib/uvr5_pack/lib_v5/layers_new.py new file mode 100644 index 0000000..44153b6 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_new.py @@ -0,0 +1,125 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) + + def __call__(self, x): + h = self.conv1(x) + h = self.conv2(h) + + return h + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + + h = self.conv1(x) + # h = self.conv2(h) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) + self.conv3 = Conv2DBNActiv( + nin, nout, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = Conv2DBNActiv( + nin, nout, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = Conv2DBNActiv( + nin, nout, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + out = self.bottleneck(out) + + if self.dropout is not None: + out = self.dropout(out) + + return out + + +class LSTMModule(nn.Module): + def __init__(self, nin_conv, nin_lstm, nout_lstm): + super(LSTMModule, self).__init__() + self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) + self.lstm = nn.LSTM( + input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True + ) + self.dense = nn.Sequential( + nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() + ) + + def forward(self, x): + N, _, nbins, nframes = x.size() + h = self.conv(x)[:, 0] # N, nbins, nframes + h = h.permute(2, 0, 1) # nframes, N, nbins + h, _ = self.lstm(h) + h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins + h = h.reshape(nframes, N, 1, nbins) + h = h.permute(1, 2, 3, 0) + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/model_param_init.py b/infer/lib/uvr5_pack/lib_v5/model_param_init.py new file mode 100644 index 0000000..b995c0b --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/model_param_init.py @@ -0,0 +1,69 @@ +import json +import os +import pathlib + +default_param = {} +default_param["bins"] = 768 +default_param["unstable_bins"] = 9 # training only +default_param["reduction_bins"] = 762 # training only +default_param["sr"] = 44100 +default_param["pre_filter_start"] = 757 +default_param["pre_filter_stop"] = 768 +default_param["band"] = {} + + +default_param["band"][1] = { + "sr": 11025, + "hl": 128, + "n_fft": 960, + "crop_start": 0, + "crop_stop": 245, + "lpf_start": 61, # inference only + "res_type": "polyphase", +} + +default_param["band"][2] = { + "sr": 44100, + "hl": 512, + "n_fft": 1536, + "crop_start": 24, + "crop_stop": 547, + "hpf_start": 81, # inference only + "res_type": "sinc_best", +} + + +def int_keys(d): + r = {} + for k, v in d: + if k.isdigit(): + k = int(k) + r[k] = v + return r + + +class ModelParameters(object): + def __init__(self, config_path=""): + if ".pth" == pathlib.Path(config_path).suffix: + import zipfile + + with zipfile.ZipFile(config_path, "r") as zip: + self.param = json.loads( + zip.read("param.json"), object_pairs_hook=int_keys + ) + elif ".json" == pathlib.Path(config_path).suffix: + with open(config_path, "r") as f: + self.param = json.loads(f.read(), object_pairs_hook=int_keys) + else: + self.param = default_param + + for k in [ + "mid_side", + "mid_side_b", + "mid_side_b2", + "stereo_w", + "stereo_n", + "reverse", + ]: + if not k in self.param: + self.param[k] = False diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json new file mode 100644 index 0000000..72cb449 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 16000, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 16000, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json new file mode 100644 index 0000000..3c00ecf --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 32000, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "kaiser_fast" + } + }, + "sr": 32000, + "pre_filter_start": 1000, + "pre_filter_stop": 1021 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json new file mode 100644 index 0000000..55666ac --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 33075, + "hl": 384, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 33075, + "pre_filter_start": 1000, + "pre_filter_stop": 1021 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json new file mode 100644 index 0000000..665abe2 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 1024, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json new file mode 100644 index 0000000..0e8b16f --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json @@ -0,0 +1,19 @@ +{ + "bins": 256, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 256, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 256, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 256, + "pre_filter_stop": 256 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json new file mode 100644 index 0000000..3b38fca --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json new file mode 100644 index 0000000..630df35 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 700, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 700 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json new file mode 100644 index 0000000..ab9cf11 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json @@ -0,0 +1,30 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 705, + "band": { + "1": { + "sr": 6000, + "hl": 66, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 240, + "lpf_start": 60, + "lpf_stop": 118, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 32000, + "hl": 352, + "n_fft": 1024, + "crop_start": 22, + "crop_stop": 505, + "hpf_start": 44, + "hpf_stop": 23, + "res_type": "sinc_medium" + } + }, + "sr": 32000, + "pre_filter_start": 710, + "pre_filter_stop": 731 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json new file mode 100644 index 0000000..7faa216 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json @@ -0,0 +1,30 @@ +{ + "bins": 512, + "unstable_bins": 7, + "reduction_bins": 510, + "band": { + "1": { + "sr": 11025, + "hl": 160, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 192, + "lpf_start": 41, + "lpf_stop": 139, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 44100, + "hl": 640, + "n_fft": 1024, + "crop_start": 10, + "crop_stop": 320, + "hpf_start": 47, + "hpf_stop": 15, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 510, + "pre_filter_stop": 512 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json new file mode 100644 index 0000000..7e78175 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json @@ -0,0 +1,30 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 705, + "band": { + "1": { + "sr": 6000, + "hl": 66, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 240, + "lpf_start": 60, + "lpf_stop": 240, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 48000, + "hl": 528, + "n_fft": 1536, + "crop_start": 22, + "crop_stop": 505, + "hpf_start": 82, + "hpf_stop": 22, + "res_type": "sinc_medium" + } + }, + "sr": 48000, + "pre_filter_start": 710, + "pre_filter_stop": 731 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json new file mode 100644 index 0000000..d881d76 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json @@ -0,0 +1,42 @@ +{ + "bins": 768, + "unstable_bins": 5, + "reduction_bins": 733, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 278, + "lpf_start": 28, + "lpf_stop": 140, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 256, + "n_fft": 768, + "crop_start": 14, + "crop_stop": 322, + "hpf_start": 70, + "hpf_stop": 14, + "lpf_start": 283, + "lpf_stop": 314, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 131, + "crop_stop": 313, + "hpf_start": 154, + "hpf_stop": 141, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 757, + "pre_filter_stop": 768 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json new file mode 100644 index 0000000..77ec198 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json @@ -0,0 +1,43 @@ +{ + "mid_side": true, + "bins": 768, + "unstable_bins": 5, + "reduction_bins": 733, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 278, + "lpf_start": 28, + "lpf_stop": 140, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 256, + "n_fft": 768, + "crop_start": 14, + "crop_stop": 322, + "hpf_start": 70, + "hpf_stop": 14, + "lpf_start": 283, + "lpf_stop": 314, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 131, + "crop_stop": 313, + "hpf_start": 154, + "hpf_stop": 141, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 757, + "pre_filter_stop": 768 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json new file mode 100644 index 0000000..85ee8a7 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json @@ -0,0 +1,43 @@ +{ + "mid_side_b2": true, + "bins": 640, + "unstable_bins": 7, + "reduction_bins": 565, + "band": { + "1": { + "sr": 11025, + "hl": 108, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 187, + "lpf_start": 92, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 216, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 212, + "hpf_start": 68, + "hpf_stop": 34, + "lpf_start": 174, + "lpf_stop": 209, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 432, + "n_fft": 640, + "crop_start": 66, + "crop_stop": 307, + "hpf_start": 86, + "hpf_stop": 72, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 639, + "pre_filter_stop": 640 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json new file mode 100644 index 0000000..df12375 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json @@ -0,0 +1,54 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json new file mode 100644 index 0000000..e91b699 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json @@ -0,0 +1,55 @@ +{ + "bins": 768, + "unstable_bins": 7, + "mid_side": true, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json new file mode 100644 index 0000000..f852f28 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json @@ -0,0 +1,55 @@ +{ + "mid_side_b": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json new file mode 100644 index 0000000..f852f28 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json @@ -0,0 +1,55 @@ +{ + "mid_side_b": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json new file mode 100644 index 0000000..7a07d55 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json @@ -0,0 +1,55 @@ +{ + "reverse": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json new file mode 100644 index 0000000..ba0cf34 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json @@ -0,0 +1,55 @@ +{ + "stereo_w": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json new file mode 100644 index 0000000..33281a0 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json @@ -0,0 +1,54 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 637, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json new file mode 100644 index 0000000..2e5c770 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json @@ -0,0 +1,55 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 637, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "convert_channels": "stereo_n", + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json new file mode 100644 index 0000000..edb908b --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json @@ -0,0 +1,54 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 530, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json b/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json new file mode 100644 index 0000000..ee69beb --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json @@ -0,0 +1,43 @@ +{ + "mid_side_b2": true, + "bins": 1280, + "unstable_bins": 7, + "reduction_bins": 565, + "band": { + "1": { + "sr": 11025, + "hl": 108, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 374, + "lpf_start": 92, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 216, + "n_fft": 1536, + "crop_start": 0, + "crop_stop": 424, + "hpf_start": 68, + "hpf_stop": 34, + "lpf_start": 348, + "lpf_stop": 418, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 432, + "n_fft": 1280, + "crop_start": 132, + "crop_stop": 614, + "hpf_start": 172, + "hpf_stop": 144, + "res_type": "polyphase" + } + }, + "sr": 44100, + "pre_filter_start": 1280, + "pre_filter_stop": 1280 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/nets.py b/infer/lib/uvr5_pack/lib_v5/nets.py new file mode 100644 index 0000000..5da3948 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets.py @@ -0,0 +1,123 @@ +import layers +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 16) + self.stg1_high_band_net = BaseASPPNet(2, 16) + + self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(8, 16) + + self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(16, 32) + + self.out = nn.Conv2d(32, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py new file mode 100644 index 0000000..167d4cb --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_123821KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 32) + self.stg1_high_band_net = BaseASPPNet(2, 32) + + self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(16, 32) + + self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(32, 64) + + self.out = nn.Conv2d(64, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py new file mode 100644 index 0000000..167d4cb --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_123821KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 32) + self.stg1_high_band_net = BaseASPPNet(2, 32) + + self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(16, 32) + + self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(32, 64) + + self.out = nn.Conv2d(64, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py new file mode 100644 index 0000000..73a5b83 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_33966KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 16) + self.stg1_high_band_net = BaseASPPNet(2, 16) + + self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(8, 16) + + self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(16, 32) + + self.out = nn.Conv2d(32, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py new file mode 100644 index 0000000..823b44f --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py @@ -0,0 +1,123 @@ +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_537238KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 64) + self.stg1_high_band_net = BaseASPPNet(2, 64) + + self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(32, 64) + + self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(64, 128) + + self.out = nn.Conv2d(128, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py new file mode 100644 index 0000000..823b44f --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py @@ -0,0 +1,123 @@ +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_537238KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 64) + self.stg1_high_band_net = BaseASPPNet(2, 64) + + self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(32, 64) + + self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(64, 128) + + self.out = nn.Conv2d(128, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py new file mode 100644 index 0000000..167d4cb --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_123821KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 32) + self.stg1_high_band_net = BaseASPPNet(2, 32) + + self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(16, 32) + + self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(32, 64) + + self.out = nn.Conv2d(64, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_new.py b/infer/lib/uvr5_pack/lib_v5/nets_new.py new file mode 100644 index 0000000..1c0f4fa --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_new.py @@ -0,0 +1,133 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_new + + +class BaseNet(nn.Module): + def __init__( + self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) + ): + super(BaseNet, self).__init__() + self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) + self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) + self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) + self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) + self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) + + self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) + + self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) + self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) + self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) + self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) + self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) + + def __call__(self, x): + e1 = self.enc1(x) + e2 = self.enc2(e1) + e3 = self.enc3(e2) + e4 = self.enc4(e3) + e5 = self.enc5(e4) + + h = self.aspp(e5) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = torch.cat([h, self.lstm_dec2(h)], dim=1) + h = self.dec1(h, e1) + + return h + + +class CascadedNet(nn.Module): + def __init__(self, n_fft, nout=32, nout_lstm=128): + super(CascadedNet, self).__init__() + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + self.nin_lstm = self.max_bin // 2 + self.offset = 64 + + self.stg1_low_band_net = nn.Sequential( + BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), + layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), + ) + + self.stg1_high_band_net = BaseNet( + 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 + ) + + self.stg2_low_band_net = nn.Sequential( + BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), + layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), + ) + self.stg2_high_band_net = BaseNet( + nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 + ) + + self.stg3_full_band_net = BaseNet( + 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm + ) + + self.out = nn.Conv2d(nout, 2, 1, bias=False) + self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) + + def forward(self, x): + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + l1_in = x[:, :, :bandw] + h1_in = x[:, :, bandw:] + l1 = self.stg1_low_band_net(l1_in) + h1 = self.stg1_high_band_net(h1_in) + aux1 = torch.cat([l1, h1], dim=2) + + l2_in = torch.cat([l1_in, l1], dim=1) + h2_in = torch.cat([h1_in, h1], dim=1) + l2 = self.stg2_low_band_net(l2_in) + h2 = self.stg2_high_band_net(h2_in) + aux2 = torch.cat([l2, h2], dim=2) + + f3_in = torch.cat([x, aux1, aux2], dim=1) + f3 = self.stg3_full_band_net(f3_in) + + mask = torch.sigmoid(self.out(f3)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux = torch.cat([aux1, aux2], dim=1) + aux = torch.sigmoid(self.aux_out(aux)) + aux = F.pad( + input=aux, + pad=(0, 0, 0, self.output_bin - aux.size()[2]), + mode="replicate", + ) + return mask, aux + else: + return mask + + def predict_mask(self, x): + mask = self.forward(x) + + if self.offset > 0: + mask = mask[:, :, :, self.offset : -self.offset] + assert mask.size()[3] > 0 + + return mask + + def predict(self, x, aggressiveness=None): + mask = self.forward(x) + pred_mag = x * mask + + if self.offset > 0: + pred_mag = pred_mag[:, :, :, self.offset : -self.offset] + assert pred_mag.size()[3] > 0 + + return pred_mag diff --git a/infer/lib/uvr5_pack/lib_v5/spec_utils.py b/infer/lib/uvr5_pack/lib_v5/spec_utils.py new file mode 100644 index 0000000..a9634fd --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/spec_utils.py @@ -0,0 +1,672 @@ +import hashlib +import json +import math +import os + +import librosa +import numpy as np +import soundfile as sf +from tqdm import tqdm + + +def crop_center(h1, h2): + h1_shape = h1.size() + h2_shape = h2.size() + + if h1_shape[3] == h2_shape[3]: + return h1 + elif h1_shape[3] < h2_shape[3]: + raise ValueError("h1_shape[3] must be greater than h2_shape[3]") + + # s_freq = (h2_shape[2] - h1_shape[2]) // 2 + # e_freq = s_freq + h1_shape[2] + s_time = (h1_shape[3] - h2_shape[3]) // 2 + e_time = s_time + h2_shape[3] + h1 = h1[:, :, :, s_time:e_time] + + return h1 + + +def wave_to_spectrogram( + wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False +): + if reverse: + wave_left = np.flip(np.asfortranarray(wave[0])) + wave_right = np.flip(np.asfortranarray(wave[1])) + elif mid_side: + wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) + elif mid_side_b2: + wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) + else: + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + + spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length) + spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) + + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def wave_to_spectrogram_mt( + wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False +): + import threading + + if reverse: + wave_left = np.flip(np.asfortranarray(wave[0])) + wave_right = np.flip(np.asfortranarray(wave[1])) + elif mid_side: + wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) + elif mid_side_b2: + wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) + else: + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + + def run_thread(**kwargs): + global spec_left + spec_left = librosa.stft(**kwargs) + + thread = threading.Thread( + target=run_thread, + kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length}, + ) + thread.start() + spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) + thread.join() + + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def combine_spectrograms(specs, mp): + l = min([specs[i].shape[2] for i in specs]) + spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) + offset = 0 + bands_n = len(mp.param["band"]) + + for d in range(1, bands_n + 1): + h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] + spec_c[:, offset : offset + h, :l] = specs[d][ + :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l + ] + offset += h + + if offset > mp.param["bins"]: + raise ValueError("Too much bins") + + # lowpass fiter + if ( + mp.param["pre_filter_start"] > 0 + ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: + if bands_n == 1: + spec_c = fft_lp_filter( + spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"] + ) + else: + gp = 1 + for b in range( + mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"] + ): + g = math.pow( + 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0 + ) + gp = g + spec_c[:, b, :] *= g + + return np.asfortranarray(spec_c) + + +def spectrogram_to_image(spec, mode="magnitude"): + if mode == "magnitude": + if np.iscomplexobj(spec): + y = np.abs(spec) + else: + y = spec + y = np.log10(y**2 + 1e-8) + elif mode == "phase": + if np.iscomplexobj(spec): + y = np.angle(spec) + else: + y = spec + + y -= y.min() + y *= 255 / y.max() + img = np.uint8(y) + + if y.ndim == 3: + img = img.transpose(1, 2, 0) + img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) + + return img + + +def reduce_vocal_aggressively(X, y, softmask): + v = X - y + y_mag_tmp = np.abs(y) + v_mag_tmp = np.abs(v) + + v_mask = v_mag_tmp > y_mag_tmp + y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) + + return y_mag * np.exp(1.0j * np.angle(y)) + + +def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32): + if min_range < fade_size * 2: + raise ValueError("min_range must be >= fade_area * 2") + + mag = mag.copy() + + idx = np.where(ref.mean(axis=(0, 1)) < thres)[0] + starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0]) + ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1]) + uninformative = np.where(ends - starts > min_range)[0] + if len(uninformative) > 0: + starts = starts[uninformative] + ends = ends[uninformative] + old_e = None + for s, e in zip(starts, ends): + if old_e is not None and s - old_e < fade_size: + s = old_e - fade_size * 2 + + if s != 0: + weight = np.linspace(0, 1, fade_size) + mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size] + else: + s -= fade_size + + if e != mag.shape[2]: + weight = np.linspace(1, 0, fade_size) + mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e] + else: + e += fade_size + + mag[:, :, s + fade_size : e - fade_size] += ref[ + :, :, s + fade_size : e - fade_size + ] + old_e = e + + return mag + + +def align_wave_head_and_tail(a, b): + l = min([a[0].size, b[0].size]) + + return a[:l, :l], b[:l, :l] + + +def cache_or_load(mix_path, inst_path, mp): + mix_basename = os.path.splitext(os.path.basename(mix_path))[0] + inst_basename = os.path.splitext(os.path.basename(inst_path))[0] + + cache_dir = "mph{}".format( + hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest() + ) + mix_cache_dir = os.path.join("cache", cache_dir) + inst_cache_dir = os.path.join("cache", cache_dir) + + os.makedirs(mix_cache_dir, exist_ok=True) + os.makedirs(inst_cache_dir, exist_ok=True) + + mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy") + inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy") + + if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path): + X_spec_m = np.load(mix_cache_path) + y_spec_m = np.load(inst_cache_path) + else: + X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + + for d in range(len(mp.param["band"]), 0, -1): + bp = mp.param["band"][d] + + if d == len(mp.param["band"]): # high-end band + X_wave[d], _ = librosa.load( + mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"] + ) + y_wave[d], _ = librosa.load( + inst_path, + bp["sr"], + False, + dtype=np.float32, + res_type=bp["res_type"], + ) + else: # lower bands + X_wave[d] = librosa.resample( + X_wave[d + 1], + mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + y_wave[d] = librosa.resample( + y_wave[d + 1], + mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + + X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d]) + + X_spec_s[d] = wave_to_spectrogram( + X_wave[d], + bp["hl"], + bp["n_fft"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + y_spec_s[d] = wave_to_spectrogram( + y_wave[d], + bp["hl"], + bp["n_fft"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + + del X_wave, y_wave + + X_spec_m = combine_spectrograms(X_spec_s, mp) + y_spec_m = combine_spectrograms(y_spec_s, mp) + + if X_spec_m.shape != y_spec_m.shape: + raise ValueError("The combined spectrograms are different: " + mix_path) + + _, ext = os.path.splitext(mix_path) + + np.save(mix_cache_path, X_spec_m) + np.save(inst_cache_path, y_spec_m) + + return X_spec_m, y_spec_m + + +def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse): + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + wave_left = librosa.istft(spec_left, hop_length=hop_length) + wave_right = librosa.istft(spec_right, hop_length=hop_length) + + if reverse: + return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) + elif mid_side: + return np.asfortranarray( + [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] + ) + elif mid_side_b2: + return np.asfortranarray( + [ + np.add(wave_right / 1.25, 0.4 * wave_left), + np.subtract(wave_left / 1.25, 0.4 * wave_right), + ] + ) + else: + return np.asfortranarray([wave_left, wave_right]) + + +def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): + import threading + + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + def run_thread(**kwargs): + global wave_left + wave_left = librosa.istft(**kwargs) + + thread = threading.Thread( + target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length} + ) + thread.start() + wave_right = librosa.istft(spec_right, hop_length=hop_length) + thread.join() + + if reverse: + return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) + elif mid_side: + return np.asfortranarray( + [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] + ) + elif mid_side_b2: + return np.asfortranarray( + [ + np.add(wave_right / 1.25, 0.4 * wave_left), + np.subtract(wave_left / 1.25, 0.4 * wave_right), + ] + ) + else: + return np.asfortranarray([wave_left, wave_right]) + + +def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): + wave_band = {} + bands_n = len(mp.param["band"]) + offset = 0 + + for d in range(1, bands_n + 1): + bp = mp.param["band"][d] + spec_s = np.ndarray( + shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex + ) + h = bp["crop_stop"] - bp["crop_start"] + spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[ + :, offset : offset + h, : + ] + + offset += h + if d == bands_n: # higher + if extra_bins_h: # if --high_end_process bypass + max_bin = bp["n_fft"] // 2 + spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[ + :, :extra_bins_h, : + ] + if bp["hpf_start"] > 0: + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) + if bands_n == 1: + wave = spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + else: + wave = np.add( + wave, + spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ), + ) + else: + sr = mp.param["band"][d + 1]["sr"] + if d == 1: # lower + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + wave = librosa.resample( + spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ), + bp["sr"], + sr, + res_type="sinc_fastest", + ) + else: # mid + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + wave2 = np.add( + wave, + spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ), + ) + # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest") + wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy") + + return wave.T + + +def fft_lp_filter(spec, bin_start, bin_stop): + g = 1.0 + for b in range(bin_start, bin_stop): + g -= 1 / (bin_stop - bin_start) + spec[:, b, :] = g * spec[:, b, :] + + spec[:, bin_stop:, :] *= 0 + + return spec + + +def fft_hp_filter(spec, bin_start, bin_stop): + g = 1.0 + for b in range(bin_start, bin_stop, -1): + g -= 1 / (bin_start - bin_stop) + spec[:, b, :] = g * spec[:, b, :] + + spec[:, 0 : bin_stop + 1, :] *= 0 + + return spec + + +def mirroring(a, spec_m, input_high_end, mp): + if "mirroring" == a: + mirror = np.flip( + np.abs( + spec_m[ + :, + mp.param["pre_filter_start"] + - 10 + - input_high_end.shape[1] : mp.param["pre_filter_start"] + - 10, + :, + ] + ), + 1, + ) + mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) + + return np.where( + np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror + ) + + if "mirroring2" == a: + mirror = np.flip( + np.abs( + spec_m[ + :, + mp.param["pre_filter_start"] + - 10 + - input_high_end.shape[1] : mp.param["pre_filter_start"] + - 10, + :, + ] + ), + 1, + ) + mi = np.multiply(mirror, input_high_end * 1.7) + + return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) + + +def ensembling(a, specs): + for i in range(1, len(specs)): + if i == 1: + spec = specs[0] + + ln = min([spec.shape[2], specs[i].shape[2]]) + spec = spec[:, :, :ln] + specs[i] = specs[i][:, :, :ln] + + if "min_mag" == a: + spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec) + if "max_mag" == a: + spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) + + return spec + + +def stft(wave, nfft, hl): + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + spec_left = librosa.stft(wave_left, nfft, hop_length=hl) + spec_right = librosa.stft(wave_right, nfft, hop_length=hl) + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def istft(spec, hl): + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + wave_left = librosa.istft(spec_left, hop_length=hl) + wave_right = librosa.istft(spec_right, hop_length=hl) + wave = np.asfortranarray([wave_left, wave_right]) + + +if __name__ == "__main__": + import argparse + import sys + import time + + import cv2 + from model_param_init import ModelParameters + + p = argparse.ArgumentParser() + p.add_argument( + "--algorithm", + "-a", + type=str, + choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"], + default="min_mag", + ) + p.add_argument( + "--model_params", + "-m", + type=str, + default=os.path.join("modelparams", "1band_sr44100_hl512.json"), + ) + p.add_argument("--output_name", "-o", type=str, default="output") + p.add_argument("--vocals_only", "-v", action="store_true") + p.add_argument("input", nargs="+") + args = p.parse_args() + + start_time = time.time() + + if args.algorithm.startswith("invert") and len(args.input) != 2: + raise ValueError("There should be two input files.") + + if not args.algorithm.startswith("invert") and len(args.input) < 2: + raise ValueError("There must be at least two input files.") + + wave, specs = {}, {} + mp = ModelParameters(args.model_params) + + for i in range(len(args.input)): + spec = {} + + for d in range(len(mp.param["band"]), 0, -1): + bp = mp.param["band"][d] + + if d == len(mp.param["band"]): # high-end band + wave[d], _ = librosa.load( + args.input[i], + bp["sr"], + False, + dtype=np.float32, + res_type=bp["res_type"], + ) + + if len(wave[d].shape) == 1: # mono to stereo + wave[d] = np.array([wave[d], wave[d]]) + else: # lower bands + wave[d] = librosa.resample( + wave[d + 1], + mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + + spec[d] = wave_to_spectrogram( + wave[d], + bp["hl"], + bp["n_fft"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + + specs[i] = combine_spectrograms(spec, mp) + + del wave + + if args.algorithm == "deep": + d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1]) + v_spec = d_spec - specs[1] + sf.write( + os.path.join("{}.wav".format(args.output_name)), + cmb_spectrogram_to_wave(v_spec, mp), + mp.param["sr"], + ) + + if args.algorithm.startswith("invert"): + ln = min([specs[0].shape[2], specs[1].shape[2]]) + specs[0] = specs[0][:, :, :ln] + specs[1] = specs[1][:, :, :ln] + + if "invert_p" == args.algorithm: + X_mag = np.abs(specs[0]) + y_mag = np.abs(specs[1]) + max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) + v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) + else: + specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) + v_spec = specs[0] - specs[1] + + if not args.vocals_only: + X_mag = np.abs(specs[0]) + y_mag = np.abs(specs[1]) + v_mag = np.abs(v_spec) + + X_image = spectrogram_to_image(X_mag) + y_image = spectrogram_to_image(y_mag) + v_image = spectrogram_to_image(v_mag) + + cv2.imwrite("{}_X.png".format(args.output_name), X_image) + cv2.imwrite("{}_y.png".format(args.output_name), y_image) + cv2.imwrite("{}_v.png".format(args.output_name), v_image) + + sf.write( + "{}_X.wav".format(args.output_name), + cmb_spectrogram_to_wave(specs[0], mp), + mp.param["sr"], + ) + sf.write( + "{}_y.wav".format(args.output_name), + cmb_spectrogram_to_wave(specs[1], mp), + mp.param["sr"], + ) + + sf.write( + "{}_v.wav".format(args.output_name), + cmb_spectrogram_to_wave(v_spec, mp), + mp.param["sr"], + ) + else: + if not args.algorithm == "deep": + sf.write( + os.path.join("ensembled", "{}.wav".format(args.output_name)), + cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), + mp.param["sr"], + ) + + if args.algorithm == "align": + trackalignment = [ + { + "file1": '"{}"'.format(args.input[0]), + "file2": '"{}"'.format(args.input[1]), + } + ] + + for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): + os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}") + + # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) diff --git a/infer/lib/uvr5_pack/name_params.json b/infer/lib/uvr5_pack/name_params.json new file mode 100644 index 0000000..8ed51a6 --- /dev/null +++ b/infer/lib/uvr5_pack/name_params.json @@ -0,0 +1,263 @@ +{ + "equivalent" : [ + { + "model_hash_name" : [ + { + "hash_name": "47939caf0cfe52a0e81442b85b971dfd", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "param_name": "4band_v2" + }, + { + "hash_name": "ca106edd563e034bde0bdec4bb7a4b36", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "param_name": "4band_v2" + }, + { + "hash_name": "e60a1e84803ce4efc0a6551206cc4b71", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "a82f14e75892e55e994376edbf0c8435", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "param_name": "4band_v2_sn" + }, + { + "hash_name": "08611fb99bd59eaa79ad27c58d137727", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "param_name": "4band_v2_sn" + }, + { + "hash_name": "5c7bbca45a187e81abbbd351606164e5", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "param_name": "3band_44100_msb2" + }, + { + "hash_name": "d6b2cb685a058a091e5e7098192d3233", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "param_name": "3band_44100_msb2" + }, + { + "hash_name": "c1b9f38170a7c90e96f027992eb7c62b", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "c3448ec923fa0edf3d03a19e633faa53", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "68aa2c8093d0080704b200d140f59e54", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", + "param_name": "3band_44100" + }, + { + "hash_name": "fdc83be5b798e4bd29fe00fe6600e147", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "param_name": "3band_44100_mid.json" + }, + { + "hash_name": "2ce34bc92fd57f55db16b7a4def3d745", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "param_name": "3band_44100_mid.json" + }, + { + "hash_name": "52fdca89576f06cf4340b74a4730ee5f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100.json" + }, + { + "hash_name": "41191165b05d38fc77f072fa9e8e8a30", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100.json" + }, + { + "hash_name": "89e83b511ad474592689e562d5b1f80e", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "param_name": "2band_32000.json" + }, + { + "hash_name": "0b954da81d453b716b114d6d7c95177f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "param_name": "2band_32000.json" + } + + ], + "v4 Models": [ + { + "hash_name": "6a00461c51c2920fd68937d4609ed6c8", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "param_name": "1band_sr16000_hl512" + }, + { + "hash_name": "0ab504864d20f1bd378fe9c81ef37140", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr32000_hl512" + }, + { + "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr32000_hl512" + }, + { + "hash_name": "80ab74d65e515caa3622728d2de07d23", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr32000_hl512" + }, + { + "hash_name": "edc115e7fc523245062200c00caa847f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "param_name": "1band_sr33075_hl384" + }, + { + "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "param_name": "1band_sr33075_hl384" + }, + { + "hash_name": "b58090534c52cbc3e9b5104bad666ef2", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "param_name": "1band_sr44100_hl512" + }, + { + "hash_name": "0cdab9947f1b0928705f518f3c78ea8f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "param_name": "1band_sr44100_hl512" + }, + { + "hash_name": "ae702fed0238afb5346db8356fe25f13", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "param_name": "1band_sr44100_hl1024" + } + ] + } + ], + "User Models" : [ + { + "1 Band": [ + { + "hash_name": "1band_sr16000_hl512", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "param_name": "1band_sr16000_hl512" + }, + { + "hash_name": "1band_sr32000_hl512", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr16000_hl512" + }, + { + "hash_name": "1band_sr33075_hl384", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "param_name": "1band_sr33075_hl384" + }, + { + "hash_name": "1band_sr44100_hl256", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", + "param_name": "1band_sr44100_hl256" + }, + { + "hash_name": "1band_sr44100_hl512", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "param_name": "1band_sr44100_hl512" + }, + { + "hash_name": "1band_sr44100_hl1024", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "param_name": "1band_sr44100_hl1024" + } + ], + "2 Band": [ + { + "hash_name": "2band_44100_lofi", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", + "param_name": "2band_44100_lofi" + }, + { + "hash_name": "2band_32000", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "param_name": "2band_32000" + }, + { + "hash_name": "2band_48000", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json", + "param_name": "2band_48000" + } + ], + "3 Band": [ + { + "hash_name": "3band_44100", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", + "param_name": "3band_44100" + }, + { + "hash_name": "3band_44100_mid", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "param_name": "3band_44100_mid" + }, + { + "hash_name": "3band_44100_msb2", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "param_name": "3band_44100_msb2" + } + ], + "4 Band": [ + { + "hash_name": "4band_44100", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "4band_44100_mid", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", + "param_name": "4band_44100_mid" + }, + { + "hash_name": "4band_44100_msb", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", + "param_name": "4band_44100_msb" + }, + { + "hash_name": "4band_44100_msb2", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", + "param_name": "4band_44100_msb2" + }, + { + "hash_name": "4band_44100_reverse", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", + "param_name": "4band_44100_reverse" + }, + { + "hash_name": "4band_44100_sw", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", + "param_name": "4band_44100_sw" + }, + { + "hash_name": "4band_v2", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "param_name": "4band_v2" + }, + { + "hash_name": "4band_v2_sn", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "param_name": "4band_v2_sn" + }, + { + "hash_name": "tmodelparam", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json", + "param_name": "User Model Param Set" + } + ] + } + ] +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/utils.py b/infer/lib/uvr5_pack/utils.py new file mode 100644 index 0000000..f4805cd --- /dev/null +++ b/infer/lib/uvr5_pack/utils.py @@ -0,0 +1,121 @@ +import json + +import numpy as np +import torch +from tqdm import tqdm + + +def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: + with open(file_name, "r") as f: + data = json.load(f) + + return data + + +def make_padding(width, cropsize, offset): + left = offset + roi_size = cropsize - left * 2 + if roi_size == 0: + roi_size = cropsize + right = roi_size - (width % roi_size) + left + + return left, right, roi_size + + +def inference(X_spec, device, model, aggressiveness, data): + """ + data : dic configs + """ + + def _execute( + X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True + ): + model.eval() + with torch.no_grad(): + preds = [] + + iterations = [n_window] + + total_iterations = sum(iterations) + for i in tqdm(range(n_window)): + start = i * roi_size + X_mag_window = X_mag_pad[ + None, :, :, start : start + data["window_size"] + ] + X_mag_window = torch.from_numpy(X_mag_window) + if is_half: + X_mag_window = X_mag_window.half() + X_mag_window = X_mag_window.to(device) + + pred = model.predict(X_mag_window, aggressiveness) + + pred = pred.detach().cpu().numpy() + preds.append(pred[0]) + + pred = np.concatenate(preds, axis=2) + return pred + + def preprocess(X_spec): + X_mag = np.abs(X_spec) + X_phase = np.angle(X_spec) + + return X_mag, X_phase + + X_mag, X_phase = preprocess(X_spec) + + coef = X_mag.max() + X_mag_pre = X_mag / coef + + n_frame = X_mag_pre.shape[2] + pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) + n_window = int(np.ceil(n_frame / roi_size)) + + X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") + + if list(model.state_dict().values())[0].dtype == torch.float16: + is_half = True + else: + is_half = False + pred = _execute( + X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half + ) + pred = pred[:, :, :n_frame] + + if data["tta"]: + pad_l += roi_size // 2 + pad_r += roi_size // 2 + n_window += 1 + + X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") + + pred_tta = _execute( + X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half + ) + pred_tta = pred_tta[:, :, roi_size // 2 :] + pred_tta = pred_tta[:, :, :n_frame] + + return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) + else: + return pred * coef, X_mag, np.exp(1.0j * X_phase) + + +def _get_name_params(model_path, model_hash): + data = load_data() + flag = False + ModelName = model_path + for type in list(data): + for model in list(data[type][0]): + for i in range(len(data[type][0][model])): + if str(data[type][0][model][i]["hash_name"]) == model_hash: + flag = True + elif str(data[type][0][model][i]["hash_name"]) in ModelName: + flag = True + + if flag: + model_params_auto = data[type][0][model][i]["model_params"] + param_name_auto = data[type][0][model][i]["param_name"] + if type == "equivalent": + return param_name_auto, model_params_auto + else: + flag = False + return param_name_auto, model_params_auto diff --git a/infer/modules/ipex/__init__.py b/infer/modules/ipex/__init__.py new file mode 100644 index 0000000..cd27bc1 --- /dev/null +++ b/infer/modules/ipex/__init__.py @@ -0,0 +1,190 @@ +import os +import sys +import contextlib +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import +from .hijacks import ipex_hijacks +from .attention import attention_init + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + + +def ipex_init(): # pylint: disable=too-many-statements + try: + # Replace cuda with xpu: + torch.cuda.current_device = torch.xpu.current_device + torch.cuda.current_stream = torch.xpu.current_stream + torch.cuda.device = torch.xpu.device + torch.cuda.device_count = torch.xpu.device_count + torch.cuda.device_of = torch.xpu.device_of + torch.cuda.get_device_name = torch.xpu.get_device_name + torch.cuda.get_device_properties = torch.xpu.get_device_properties + torch.cuda.init = torch.xpu.init + torch.cuda.is_available = torch.xpu.is_available + torch.cuda.is_initialized = torch.xpu.is_initialized + torch.cuda.is_current_stream_capturing = lambda: False + torch.cuda.set_device = torch.xpu.set_device + torch.cuda.stream = torch.xpu.stream + torch.cuda.synchronize = torch.xpu.synchronize + torch.cuda.Event = torch.xpu.Event + torch.cuda.Stream = torch.xpu.Stream + torch.cuda.FloatTensor = torch.xpu.FloatTensor + torch.Tensor.cuda = torch.Tensor.xpu + torch.Tensor.is_cuda = torch.Tensor.is_xpu + torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock + torch.cuda._initialized = torch.xpu.lazy_init._initialized + torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker + torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls + torch.cuda._tls = torch.xpu.lazy_init._tls + torch.cuda.threading = torch.xpu.lazy_init.threading + torch.cuda.traceback = torch.xpu.lazy_init.traceback + torch.cuda.Optional = torch.xpu.Optional + torch.cuda.__cached__ = torch.xpu.__cached__ + torch.cuda.__loader__ = torch.xpu.__loader__ + torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage + torch.cuda.Tuple = torch.xpu.Tuple + torch.cuda.streams = torch.xpu.streams + torch.cuda._lazy_new = torch.xpu._lazy_new + torch.cuda.FloatStorage = torch.xpu.FloatStorage + torch.cuda.Any = torch.xpu.Any + torch.cuda.__doc__ = torch.xpu.__doc__ + torch.cuda.default_generators = torch.xpu.default_generators + torch.cuda.HalfTensor = torch.xpu.HalfTensor + torch.cuda._get_device_index = torch.xpu._get_device_index + torch.cuda.__path__ = torch.xpu.__path__ + torch.cuda.Device = torch.xpu.Device + torch.cuda.IntTensor = torch.xpu.IntTensor + torch.cuda.ByteStorage = torch.xpu.ByteStorage + torch.cuda.set_stream = torch.xpu.set_stream + torch.cuda.BoolStorage = torch.xpu.BoolStorage + torch.cuda.os = torch.xpu.os + torch.cuda.torch = torch.xpu.torch + torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage + torch.cuda.Union = torch.xpu.Union + torch.cuda.DoubleTensor = torch.xpu.DoubleTensor + torch.cuda.ShortTensor = torch.xpu.ShortTensor + torch.cuda.LongTensor = torch.xpu.LongTensor + torch.cuda.IntStorage = torch.xpu.IntStorage + torch.cuda.LongStorage = torch.xpu.LongStorage + torch.cuda.__annotations__ = torch.xpu.__annotations__ + torch.cuda.__package__ = torch.xpu.__package__ + torch.cuda.__builtins__ = torch.xpu.__builtins__ + torch.cuda.CharTensor = torch.xpu.CharTensor + torch.cuda.List = torch.xpu.List + torch.cuda._lazy_init = torch.xpu._lazy_init + torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor + torch.cuda.DoubleStorage = torch.xpu.DoubleStorage + torch.cuda.ByteTensor = torch.xpu.ByteTensor + torch.cuda.StreamContext = torch.xpu.StreamContext + torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage + torch.cuda.ShortStorage = torch.xpu.ShortStorage + torch.cuda._lazy_call = torch.xpu._lazy_call + torch.cuda.HalfStorage = torch.xpu.HalfStorage + torch.cuda.random = torch.xpu.random + torch.cuda._device = torch.xpu._device + torch.cuda.classproperty = torch.xpu.classproperty + torch.cuda.__name__ = torch.xpu.__name__ + torch.cuda._device_t = torch.xpu._device_t + torch.cuda.warnings = torch.xpu.warnings + torch.cuda.__spec__ = torch.xpu.__spec__ + torch.cuda.BoolTensor = torch.xpu.BoolTensor + torch.cuda.CharStorage = torch.xpu.CharStorage + torch.cuda.__file__ = torch.xpu.__file__ + torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork + # torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing + + # Memory: + torch.cuda.memory = torch.xpu.memory + if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read(): + torch.xpu.empty_cache = lambda: None + torch.cuda.empty_cache = torch.xpu.empty_cache + torch.cuda.memory_stats = torch.xpu.memory_stats + torch.cuda.memory_summary = torch.xpu.memory_summary + torch.cuda.memory_snapshot = torch.xpu.memory_snapshot + torch.cuda.memory_allocated = torch.xpu.memory_allocated + torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated + torch.cuda.memory_reserved = torch.xpu.memory_reserved + torch.cuda.memory_cached = torch.xpu.memory_reserved + torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved + torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved + torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats + torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict + torch.cuda.reset_accumulated_memory_stats = ( + torch.xpu.reset_accumulated_memory_stats + ) + + # RNG: + torch.cuda.get_rng_state = torch.xpu.get_rng_state + torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all + torch.cuda.set_rng_state = torch.xpu.set_rng_state + torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all + torch.cuda.manual_seed = torch.xpu.manual_seed + torch.cuda.manual_seed_all = torch.xpu.manual_seed_all + torch.cuda.seed = torch.xpu.seed + torch.cuda.seed_all = torch.xpu.seed_all + torch.cuda.initial_seed = torch.xpu.initial_seed + + # AMP: + torch.cuda.amp = torch.xpu.amp + if not hasattr(torch.cuda.amp, "common"): + torch.cuda.amp.common = contextlib.nullcontext() + torch.cuda.amp.common.amp_definitely_not_available = lambda: False + try: + torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler + except Exception: # pylint: disable=broad-exception-caught + try: + from .gradscaler import ( + gradscaler_init, + ) # pylint: disable=import-outside-toplevel, import-error + + gradscaler_init() + torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler + except Exception: # pylint: disable=broad-exception-caught + torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler + + # C + torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream + ipex._C._DeviceProperties.major = 2023 + ipex._C._DeviceProperties.minor = 2 + + # Fix functions with ipex: + torch.cuda.mem_get_info = lambda device=None: [ + ( + torch.xpu.get_device_properties(device).total_memory + - torch.xpu.memory_allocated(device) + ), + torch.xpu.get_device_properties(device).total_memory, + ] + torch._utils._get_available_device_type = lambda: "xpu" + torch.has_cuda = True + torch.cuda.has_half = True + torch.cuda.is_bf16_supported = lambda *args, **kwargs: True + torch.cuda.is_fp16_supported = lambda *args, **kwargs: True + torch.version.cuda = "11.7" + torch.cuda.get_device_capability = lambda *args, **kwargs: [11, 7] + torch.cuda.get_device_properties.major = 11 + torch.cuda.get_device_properties.minor = 7 + torch.cuda.ipc_collect = lambda *args, **kwargs: None + torch.cuda.utilization = lambda *args, **kwargs: 0 + if hasattr(torch.xpu, "getDeviceIdListForCard"): + torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard + torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard + else: + torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card + torch.cuda.get_device_id_list_per_card = ( + torch.xpu.get_device_id_list_per_card + ) + + ipex_hijacks() + attention_init() + try: + from .diffusers import ipex_diffusers + + ipex_diffusers() + except Exception: # pylint: disable=broad-exception-caught + pass + except Exception as e: + return False, e + return True, None diff --git a/infer/modules/ipex/attention.py b/infer/modules/ipex/attention.py new file mode 100644 index 0000000..0cc2803 --- /dev/null +++ b/infer/modules/ipex/attention.py @@ -0,0 +1,212 @@ +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +original_torch_bmm = torch.bmm + + +def torch_bmm(input, mat2, *, out=None): + if input.dtype != mat2.dtype: + mat2 = mat2.to(input.dtype) + + # ARC GPUs can't allocate more than 4GB to a single block, Slice it: + batch_size_attention, input_tokens, mat2_shape = ( + input.shape[0], + input.shape[1], + mat2.shape[2], + ) + block_multiply = input.element_size() + slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + + split_slice_size = batch_size_attention + if block_size > 4: + do_split = True + # Find something divisible with the input_tokens + while (split_slice_size * slice_block_size) > 4: + split_slice_size = split_slice_size // 2 + if split_slice_size <= 1: + split_slice_size = 1 + break + else: + do_split = False + + split_2_slice_size = input_tokens + if split_slice_size * slice_block_size > 4: + slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply + do_split_2 = True + # Find something divisible with the input_tokens + while (split_2_slice_size * slice_block_size2) > 4: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + if do_split: + hidden_states = torch.zeros( + input.shape[0], + input.shape[1], + mat2.shape[2], + device=input.device, + dtype=input.dtype, + ) + for i in range(batch_size_attention // split_slice_size): + start_idx = i * split_slice_size + end_idx = (i + 1) * split_slice_size + if do_split_2: + for i2 in range( + input_tokens // split_2_slice_size + ): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + hidden_states[ + start_idx:end_idx, start_idx_2:end_idx_2 + ] = original_torch_bmm( + input[start_idx:end_idx, start_idx_2:end_idx_2], + mat2[start_idx:end_idx, start_idx_2:end_idx_2], + out=out, + ) + else: + hidden_states[start_idx:end_idx] = original_torch_bmm( + input[start_idx:end_idx], mat2[start_idx:end_idx], out=out + ) + else: + return original_torch_bmm(input, mat2, out=out) + return hidden_states + + +original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention + + +def scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False +): + # ARC GPUs can't allocate more than 4GB to a single block, Slice it: + if len(query.shape) == 3: + batch_size_attention, query_tokens, shape_four = query.shape + shape_one = 1 + no_shape_one = True + else: + shape_one, batch_size_attention, query_tokens, shape_four = query.shape + no_shape_one = False + + block_multiply = query.element_size() + slice_block_size = ( + shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply + ) + block_size = batch_size_attention * slice_block_size + + split_slice_size = batch_size_attention + if block_size > 4: + do_split = True + # Find something divisible with the shape_one + while (split_slice_size * slice_block_size) > 4: + split_slice_size = split_slice_size // 2 + if split_slice_size <= 1: + split_slice_size = 1 + break + else: + do_split = False + + split_2_slice_size = query_tokens + if split_slice_size * slice_block_size > 4: + slice_block_size2 = ( + shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply + ) + do_split_2 = True + # Find something divisible with the batch_size_attention + while (split_2_slice_size * slice_block_size2) > 4: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + if do_split: + hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype) + for i in range(batch_size_attention // split_slice_size): + start_idx = i * split_slice_size + end_idx = (i + 1) * split_slice_size + if do_split_2: + for i2 in range( + query_tokens // split_2_slice_size + ): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + if no_shape_one: + hidden_states[ + start_idx:end_idx, start_idx_2:end_idx_2 + ] = original_scaled_dot_product_attention( + query[start_idx:end_idx, start_idx_2:end_idx_2], + key[start_idx:end_idx, start_idx_2:end_idx_2], + value[start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[ + start_idx:end_idx, start_idx_2:end_idx_2 + ] + if attn_mask is not None + else attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + ) + else: + hidden_states[ + :, start_idx:end_idx, start_idx_2:end_idx_2 + ] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx, start_idx_2:end_idx_2], + key[:, start_idx:end_idx, start_idx_2:end_idx_2], + value[:, start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[ + :, start_idx:end_idx, start_idx_2:end_idx_2 + ] + if attn_mask is not None + else attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + ) + else: + if no_shape_one: + hidden_states[ + start_idx:end_idx + ] = original_scaled_dot_product_attention( + query[start_idx:end_idx], + key[start_idx:end_idx], + value[start_idx:end_idx], + attn_mask=attn_mask[start_idx:end_idx] + if attn_mask is not None + else attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + ) + else: + hidden_states[ + :, start_idx:end_idx + ] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx], + key[:, start_idx:end_idx], + value[:, start_idx:end_idx], + attn_mask=attn_mask[:, start_idx:end_idx] + if attn_mask is not None + else attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + ) + else: + return original_scaled_dot_product_attention( + query, + key, + value, + attn_mask=attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + ) + return hidden_states + + +def attention_init(): + # ARC GPUs can't allocate more than 4GB to a single block: + torch.bmm = torch_bmm + torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention diff --git a/infer/modules/ipex/gradscaler.py b/infer/modules/ipex/gradscaler.py new file mode 100644 index 0000000..7875151 --- /dev/null +++ b/infer/modules/ipex/gradscaler.py @@ -0,0 +1,187 @@ +from collections import defaultdict +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import +import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +OptState = ipex.cpu.autocast._grad_scaler.OptState +_MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator +_refresh_per_optimizer_state = ( + ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state +) + + +def _unscale_grads_( + self, optimizer, inv_scale, found_inf, allow_fp16 +): # pylint: disable=unused-argument + per_device_inv_scale = _MultiDeviceReplicator(inv_scale) + per_device_found_inf = _MultiDeviceReplicator(found_inf) + + # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype. + # There could be hundreds of grads, so we'd like to iterate through them just once. + # However, we don't know their devices or dtypes in advance. + + # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict + # Google says mypy struggles with defaultdicts type annotations. + per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated] + # sync grad to master weight + if hasattr(optimizer, "sync_grad"): + optimizer.sync_grad() + with torch.no_grad(): + for group in optimizer.param_groups: + for param in group["params"]: + if param.grad is None: + continue + if (not allow_fp16) and param.grad.dtype == torch.float16: + raise ValueError("Attempting to unscale FP16 gradients.") + if param.grad.is_sparse: + # is_coalesced() == False means the sparse grad has values with duplicate indices. + # coalesce() deduplicates indices and adds all values that have the same index. + # For scaled fp16 values, there's a good chance coalescing will cause overflow, + # so we should check the coalesced _values(). + if param.grad.dtype is torch.float16: + param.grad = param.grad.coalesce() + to_unscale = param.grad._values() + else: + to_unscale = param.grad + + # -: is there a way to split by device and dtype without appending in the inner loop? + to_unscale = to_unscale.to("cpu") + per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append( + to_unscale + ) + + for _, per_dtype_grads in per_device_and_dtype_grads.items(): + for grads in per_dtype_grads.values(): + core._amp_foreach_non_finite_check_and_unscale_( + grads, + per_device_found_inf.get("cpu"), + per_device_inv_scale.get("cpu"), + ) + + return per_device_found_inf._per_device_tensors + + +def unscale_(self, optimizer): + """ + Divides ("unscales") the optimizer's gradient tensors by the scale factor. + :meth:`unscale_` is optional, serving cases where you need to + :ref:`modify or inspect gradients` + between the backward pass(es) and :meth:`step`. + If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`. + Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients:: + ... + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + scaler.step(optimizer) + scaler.update() + Args: + optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled. + .. warning:: + :meth:`unscale_` should only be called once per optimizer per :meth:`step` call, + and only after all gradients for that optimizer's assigned parameters have been accumulated. + Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError. + .. warning:: + :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute. + """ + if not self._enabled: + return + + self._check_scale_growth_tracker("unscale_") + + optimizer_state = self._per_optimizer_states[id(optimizer)] + + if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise + raise RuntimeError( + "unscale_() has already been called on this optimizer since the last update()." + ) + elif optimizer_state["stage"] is OptState.STEPPED: + raise RuntimeError("unscale_() is being called after step().") + + # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. + assert self._scale is not None + inv_scale = ( + self._scale.to("cpu").double().reciprocal().float().to(self._scale.device) + ) + found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device) + + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + optimizer, inv_scale, found_inf, False + ) + optimizer_state["stage"] = OptState.UNSCALED + + +def update(self, new_scale=None): + """ + Updates the scale factor. + If any optimizer steps were skipped the scale is multiplied by ``backoff_factor`` + to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively, + the scale is multiplied by ``growth_factor`` to increase it. + Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not + used directly, it's used to fill GradScaler's internal scale tensor. So if + ``new_scale`` was a tensor, later in-place changes to that tensor will not further + affect the scale GradScaler uses internally.) + Args: + new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor. + .. warning:: + :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has + been invoked for all optimizers used this iteration. + """ + if not self._enabled: + return + + _scale, _growth_tracker = self._check_scale_growth_tracker("update") + + if new_scale is not None: + # Accept a new user-defined scale. + if isinstance(new_scale, float): + self._scale.fill_(new_scale) # type: ignore[union-attr] + else: + reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False." + assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined] + assert new_scale.numel() == 1, reason + assert new_scale.requires_grad is False, reason + self._scale.copy_(new_scale) # type: ignore[union-attr] + else: + # Consume shared inf/nan data collected from optimizers to update the scale. + # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous. + found_infs = [ + found_inf.to(device="cpu", non_blocking=True) + for state in self._per_optimizer_states.values() + for found_inf in state["found_inf_per_device"].values() + ] + + assert len(found_infs) > 0, "No inf checks were recorded prior to update." + + found_inf_combined = found_infs[0] + if len(found_infs) > 1: + for i in range(1, len(found_infs)): + found_inf_combined += found_infs[i] + + to_device = _scale.device + _scale = _scale.to("cpu") + _growth_tracker = _growth_tracker.to("cpu") + + core._amp_update_scale_( + _scale, + _growth_tracker, + found_inf_combined, + self._growth_factor, + self._backoff_factor, + self._growth_interval, + ) + + _scale = _scale.to(to_device) + _growth_tracker = _growth_tracker.to(to_device) + # To prepare for next iteration, clear the data collected from optimizers this iteration. + self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) + + +def gradscaler_init(): + torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler + torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_ + torch.xpu.amp.GradScaler.unscale_ = unscale_ + torch.xpu.amp.GradScaler.update = update + return torch.xpu.amp.GradScaler diff --git a/infer/modules/ipex/hijacks.py b/infer/modules/ipex/hijacks.py new file mode 100644 index 0000000..d95fd61 --- /dev/null +++ b/infer/modules/ipex/hijacks.py @@ -0,0 +1,357 @@ +import contextlib +import importlib +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return + + +class CondFunc: # pylint: disable=missing-class-docstring + def __new__(cls, orig_func, sub_func, cond_func): + self = super(CondFunc, cls).__new__(cls) + if isinstance(orig_func, str): + func_path = orig_func.split(".") + for i in range(len(func_path) - 1, -1, -1): + try: + resolved_obj = importlib.import_module(".".join(func_path[:i])) + break + except ImportError: + pass + for attr_name in func_path[i:-1]: + resolved_obj = getattr(resolved_obj, attr_name) + orig_func = getattr(resolved_obj, func_path[-1]) + setattr( + resolved_obj, + func_path[-1], + lambda *args, **kwargs: self(*args, **kwargs), + ) + self.__init__(orig_func, sub_func, cond_func) + return lambda *args, **kwargs: self(*args, **kwargs) + + def __init__(self, orig_func, sub_func, cond_func): + self.__orig_func = orig_func + self.__sub_func = sub_func + self.__cond_func = cond_func + + def __call__(self, *args, **kwargs): + if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs): + return self.__sub_func(self.__orig_func, *args, **kwargs) + else: + return self.__orig_func(*args, **kwargs) + + +_utils = torch.utils.data._utils + + +def _shutdown_workers(self): + if ( + torch.utils.data._utils is None + or torch.utils.data._utils.python_exit_status is True + or torch.utils.data._utils.python_exit_status is None + ): + return + if hasattr(self, "_shutdown") and not self._shutdown: + self._shutdown = True + try: + if hasattr(self, "_pin_memory_thread"): + self._pin_memory_thread_done_event.set() + self._worker_result_queue.put((None, None)) + self._pin_memory_thread.join() + self._worker_result_queue.cancel_join_thread() + self._worker_result_queue.close() + self._workers_done_event.set() + for worker_id in range(len(self._workers)): + if self._persistent_workers or self._workers_status[worker_id]: + self._mark_worker_as_unavailable(worker_id, shutdown=True) + for w in self._workers: # pylint: disable=invalid-name + w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL) + for q in self._index_queues: # pylint: disable=invalid-name + q.cancel_join_thread() + q.close() + finally: + if self._worker_pids_set: + torch.utils.data._utils.signal_handling._remove_worker_pids(id(self)) + self._worker_pids_set = False + for w in self._workers: # pylint: disable=invalid-name + if w.is_alive(): + w.terminate() + + +class DummyDataParallel( + torch.nn.Module +): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods + def __new__( + cls, module, device_ids=None, output_device=None, dim=0 + ): # pylint: disable=unused-argument + if isinstance(device_ids, list) and len(device_ids) > 1: + print("IPEX backend doesn't support DataParallel on multiple XPU devices") + return module.to("xpu") + + +def return_null_context(*args, **kwargs): # pylint: disable=unused-argument + return contextlib.nullcontext() + + +def check_device(device): + return bool( + (isinstance(device, torch.device) and device.type == "cuda") + or (isinstance(device, str) and "cuda" in device) + or isinstance(device, int) + ) + + +def return_xpu(device): + return ( + f"xpu:{device[-1]}" + if isinstance(device, str) and ":" in device + else f"xpu:{device}" + if isinstance(device, int) + else torch.device("xpu") + if isinstance(device, torch.device) + else "xpu" + ) + + +def ipex_no_cuda(orig_func, *args, **kwargs): + torch.cuda.is_available = lambda: False + orig_func(*args, **kwargs) + torch.cuda.is_available = torch.xpu.is_available + + +original_autocast = torch.autocast + + +def ipex_autocast(*args, **kwargs): + if len(args) > 0 and args[0] == "cuda": + return original_autocast("xpu", *args[1:], **kwargs) + else: + return original_autocast(*args, **kwargs) + + +original_torch_cat = torch.cat + + +def torch_cat(tensor, *args, **kwargs): + if len(tensor) == 3 and ( + tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype + ): + return original_torch_cat( + [tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)], + *args, + **kwargs, + ) + else: + return original_torch_cat(tensor, *args, **kwargs) + + +original_interpolate = torch.nn.functional.interpolate + + +def interpolate( + tensor, + size=None, + scale_factor=None, + mode="nearest", + align_corners=None, + recompute_scale_factor=None, + antialias=False, +): # pylint: disable=too-many-arguments + if antialias or align_corners is not None: + return_device = tensor.device + return_dtype = tensor.dtype + return original_interpolate( + tensor.to("cpu", dtype=torch.float32), + size=size, + scale_factor=scale_factor, + mode=mode, + align_corners=align_corners, + recompute_scale_factor=recompute_scale_factor, + antialias=antialias, + ).to(return_device, dtype=return_dtype) + else: + return original_interpolate( + tensor, + size=size, + scale_factor=scale_factor, + mode=mode, + align_corners=align_corners, + recompute_scale_factor=recompute_scale_factor, + antialias=antialias, + ) + + +original_linalg_solve = torch.linalg.solve + + +def linalg_solve(A, B, *args, **kwargs): # pylint: disable=invalid-name + if A.device != torch.device("cpu") or B.device != torch.device("cpu"): + return_device = A.device + return original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs).to( + return_device + ) + else: + return original_linalg_solve(A, B, *args, **kwargs) + + +def ipex_hijacks(): + CondFunc( + "torch.Tensor.to", + lambda orig_func, self, device=None, *args, **kwargs: orig_func( + self, return_xpu(device), *args, **kwargs + ), + lambda orig_func, self, device=None, *args, **kwargs: check_device(device), + ) + CondFunc( + "torch.Tensor.cuda", + lambda orig_func, self, device=None, *args, **kwargs: orig_func( + self, return_xpu(device), *args, **kwargs + ), + lambda orig_func, self, device=None, *args, **kwargs: check_device(device), + ) + CondFunc( + "torch.empty", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.load", + lambda orig_func, *args, map_location=None, **kwargs: orig_func( + *args, return_xpu(map_location), **kwargs + ), + lambda orig_func, *args, map_location=None, **kwargs: map_location is None + or check_device(map_location), + ) + CondFunc( + "torch.randn", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.ones", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.zeros", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.tensor", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.linspace", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + + CondFunc( + "torch.Generator", + lambda orig_func, device=None: torch.xpu.Generator(device), + lambda orig_func, device=None: device is not None + and device != torch.device("cpu") + and device != "cpu", + ) + + CondFunc( + "torch.batch_norm", + lambda orig_func, input, weight, bias, *args, **kwargs: orig_func( + input, + weight + if weight is not None + else torch.ones(input.size()[1], device=input.device), + bias + if bias is not None + else torch.zeros(input.size()[1], device=input.device), + *args, + **kwargs, + ), + lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"), + ) + CondFunc( + "torch.instance_norm", + lambda orig_func, input, weight, bias, *args, **kwargs: orig_func( + input, + weight + if weight is not None + else torch.ones(input.size()[1], device=input.device), + bias + if bias is not None + else torch.zeros(input.size()[1], device=input.device), + *args, + **kwargs, + ), + lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"), + ) + + # Functions with dtype errors: + CondFunc( + "torch.nn.modules.GroupNorm.forward", + lambda orig_func, self, input: orig_func( + self, input.to(self.weight.data.dtype) + ), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype, + ) + CondFunc( + "torch.nn.modules.linear.Linear.forward", + lambda orig_func, self, input: orig_func( + self, input.to(self.weight.data.dtype) + ), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype, + ) + CondFunc( + "torch.nn.modules.conv.Conv2d.forward", + lambda orig_func, self, input: orig_func( + self, input.to(self.weight.data.dtype) + ), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype, + ) + CondFunc( + "torch.nn.functional.layer_norm", + lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: orig_func( + input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs + ), + lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: weight + is not None + and input.dtype != weight.data.dtype, + ) + + # Diffusers Float64 (ARC GPUs doesn't support double or Float64): + if not torch.xpu.has_fp64_dtype(): + CondFunc( + "torch.from_numpy", + lambda orig_func, ndarray: orig_func(ndarray.astype("float32")), + lambda orig_func, ndarray: ndarray.dtype == float, + ) + + # Broken functions when torch.cuda.is_available is True: + CondFunc( + "torch.utils.data.dataloader._BaseDataLoaderIter.__init__", + lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs), + lambda orig_func, *args, **kwargs: True, + ) + + # Functions that make compile mad with CondFunc: + torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = ( + _shutdown_workers + ) + torch.nn.DataParallel = DummyDataParallel + torch.autocast = ipex_autocast + torch.cat = torch_cat + torch.linalg.solve = linalg_solve + torch.nn.functional.interpolate = interpolate + torch.backends.cuda.sdp_kernel = return_null_context diff --git a/infer/modules/onnx/export.py b/infer/modules/onnx/export.py new file mode 100644 index 0000000..ed4a416 --- /dev/null +++ b/infer/modules/onnx/export.py @@ -0,0 +1,52 @@ +import torch + +from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM + + +def export_onnx(ModelPath, ExportedPath): + cpt = torch.load(ModelPath, map_location="cpu") + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 + + test_phone = torch.rand(1, 200, vec_channels) # hidden unit + test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) + test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) + test_pitchf = torch.rand(1, 200) # nsf基频 + test_ds = torch.LongTensor([0]) # 说话人ID + test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) + + device = "cpu" # 导出时设备(不影响使用模型) + + net_g = SynthesizerTrnMsNSFsidM( + *cpt["config"], is_half=False, version=cpt.get("version", "v1") + ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) + net_g.load_state_dict(cpt["weight"], strict=False) + input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] + output_names = [ + "audio", + ] + # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 + torch.onnx.export( + net_g, + ( + test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + test_rnd.to(device), + ), + ExportedPath, + dynamic_axes={ + "phone": [1], + "pitch": [1], + "pitchf": [1], + "rnd": [2], + }, + do_constant_folding=False, + opset_version=13, + verbose=False, + input_names=input_names, + output_names=output_names, + ) + return "Finished" diff --git a/infer/modules/train/extract/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py new file mode 100644 index 0000000..f82435e --- /dev/null +++ b/infer/modules/train/extract/extract_f0_print.py @@ -0,0 +1,175 @@ +import os +import sys +import traceback + +import parselmouth + +now_dir = os.getcwd() +sys.path.append(now_dir) +import logging + +import numpy as np +import pyworld + +from infer.lib.audio import load_audio + +logging.getLogger("numba").setLevel(logging.WARNING) +from multiprocessing import Process + +exp_dir = sys.argv[1] +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +n_p = int(sys.argv[2]) +f0method = sys.argv[3] + + +class FeatureInput(object): + def __init__(self, samplerate=16000, hop_size=160): + self.fs = samplerate + self.hop = hop_size + + self.f0_bin = 256 + self.f0_max = 1100.0 + self.f0_min = 50.0 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + + def compute_f0(self, path, f0_method): + x = load_audio(path, self.fs) + p_len = x.shape[0] // self.hop + if f0_method == "pm": + time_step = 160 / 16000 * 1000 + f0_min = 50 + f0_max = 1100 + f0 = ( + parselmouth.Sound(x, self.fs) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + f0, t = pyworld.harvest( + x.astype(np.double), + fs=self.fs, + f0_ceil=self.f0_max, + f0_floor=self.f0_min, + frame_period=1000 * self.hop / self.fs, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) + elif f0_method == "dio": + f0, t = pyworld.dio( + x.astype(np.double), + fs=self.fs, + f0_ceil=self.f0_max, + f0_floor=self.f0_min, + frame_period=1000 * self.hop / self.fs, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) + elif f0_method == "rmvpe": + if hasattr(self, "model_rmvpe") == False: + from infer.lib.rmvpe import RMVPE + + print("Loading rmvpe model") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + return f0 + + def coarse_f0(self, f0): + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( + self.f0_bin - 2 + ) / (self.f0_mel_max - self.f0_mel_min) + 1 + + # use 0 or 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 + f0_coarse = np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( + f0_coarse.max(), + f0_coarse.min(), + ) + return f0_coarse + + def go(self, paths, f0_method): + if len(paths) == 0: + printt("no-f0-todo") + else: + printt("todo-f0-%s" % len(paths)) + n = max(len(paths) // 5, 1) # 每个进程最多打印5条 + for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): + try: + if idx % n == 0: + printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) + if ( + os.path.exists(opt_path1 + ".npy") == True + and os.path.exists(opt_path2 + ".npy") == True + ): + continue + featur_pit = self.compute_f0(inp_path, f0_method) + np.save( + opt_path2, + featur_pit, + allow_pickle=False, + ) # nsf + coarse_pit = self.coarse_f0(featur_pit) + np.save( + opt_path1, + coarse_pit, + allow_pickle=False, + ) # ori + except: + printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) + + +if __name__ == "__main__": + # exp_dir=r"E:\codes\py39\dataset\mi-test" + # n_p=16 + # f = open("%s/log_extract_f0.log"%exp_dir, "w") + printt(sys.argv) + featureInput = FeatureInput() + paths = [] + inp_root = "%s/1_16k_wavs" % (exp_dir) + opt_root1 = "%s/2a_f0" % (exp_dir) + opt_root2 = "%s/2b-f0nsf" % (exp_dir) + + os.makedirs(opt_root1, exist_ok=True) + os.makedirs(opt_root2, exist_ok=True) + for name in sorted(list(os.listdir(inp_root))): + inp_path = "%s/%s" % (inp_root, name) + if "spec" in inp_path: + continue + opt_path1 = "%s/%s" % (opt_root1, name) + opt_path2 = "%s/%s" % (opt_root2, name) + paths.append([inp_path, opt_path1, opt_path2]) + + ps = [] + for i in range(n_p): + p = Process( + target=featureInput.go, + args=( + paths[i::n_p], + f0method, + ), + ) + ps.append(p) + p.start() + for i in range(n_p): + ps[i].join() diff --git a/infer/modules/train/extract/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py new file mode 100644 index 0000000..c6c9044 --- /dev/null +++ b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -0,0 +1,141 @@ +import os +import sys +import traceback + +import parselmouth + +now_dir = os.getcwd() +sys.path.append(now_dir) +import logging + +import numpy as np +import pyworld + +from infer.lib.audio import load_audio + +logging.getLogger("numba").setLevel(logging.WARNING) + +n_part = int(sys.argv[1]) +i_part = int(sys.argv[2]) +i_gpu = sys.argv[3] +os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) +exp_dir = sys.argv[4] +is_half = sys.argv[5] +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +class FeatureInput(object): + def __init__(self, samplerate=16000, hop_size=160): + self.fs = samplerate + self.hop = hop_size + + self.f0_bin = 256 + self.f0_max = 1100.0 + self.f0_min = 50.0 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + + def compute_f0(self, path, f0_method): + x = load_audio(path, self.fs) + # p_len = x.shape[0] // self.hop + if f0_method == "rmvpe": + if hasattr(self, "model_rmvpe") == False: + from infer.lib.rmvpe import RMVPE + + print("Loading rmvpe model") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + return f0 + + def coarse_f0(self, f0): + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( + self.f0_bin - 2 + ) / (self.f0_mel_max - self.f0_mel_min) + 1 + + # use 0 or 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 + f0_coarse = np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( + f0_coarse.max(), + f0_coarse.min(), + ) + return f0_coarse + + def go(self, paths, f0_method): + if len(paths) == 0: + printt("no-f0-todo") + else: + printt("todo-f0-%s" % len(paths)) + n = max(len(paths) // 5, 1) # 每个进程最多打印5条 + for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): + try: + if idx % n == 0: + printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) + if ( + os.path.exists(opt_path1 + ".npy") == True + and os.path.exists(opt_path2 + ".npy") == True + ): + continue + featur_pit = self.compute_f0(inp_path, f0_method) + np.save( + opt_path2, + featur_pit, + allow_pickle=False, + ) # nsf + coarse_pit = self.coarse_f0(featur_pit) + np.save( + opt_path1, + coarse_pit, + allow_pickle=False, + ) # ori + except: + printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) + + +if __name__ == "__main__": + # exp_dir=r"E:\codes\py39\dataset\mi-test" + # n_p=16 + # f = open("%s/log_extract_f0.log"%exp_dir, "w") + printt(sys.argv) + featureInput = FeatureInput() + paths = [] + inp_root = "%s/1_16k_wavs" % (exp_dir) + opt_root1 = "%s/2a_f0" % (exp_dir) + opt_root2 = "%s/2b-f0nsf" % (exp_dir) + + os.makedirs(opt_root1, exist_ok=True) + os.makedirs(opt_root2, exist_ok=True) + for name in sorted(list(os.listdir(inp_root))): + inp_path = "%s/%s" % (inp_root, name) + if "spec" in inp_path: + continue + opt_path1 = "%s/%s" % (opt_root1, name) + opt_path2 = "%s/%s" % (opt_root2, name) + paths.append([inp_path, opt_path1, opt_path2]) + try: + featureInput.go(paths[i_part::n_part], "rmvpe") + except: + printt("f0_all_fail-%s" % (traceback.format_exc())) + # ps = [] + # for i in range(n_p): + # p = Process( + # target=featureInput.go, + # args=( + # paths[i::n_p], + # f0method, + # ), + # ) + # ps.append(p) + # p.start() + # for i in range(n_p): + # ps[i].join() diff --git a/infer/modules/train/extract/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py new file mode 100644 index 0000000..6abb189 --- /dev/null +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -0,0 +1,139 @@ +import os +import sys +import traceback + +import parselmouth + +now_dir = os.getcwd() +sys.path.append(now_dir) +import logging + +import numpy as np +import pyworld + +from infer.lib.audio import load_audio + +logging.getLogger("numba").setLevel(logging.WARNING) + +exp_dir = sys.argv[1] +import torch_directml + +device = torch_directml.device(torch_directml.default_device()) +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +class FeatureInput(object): + def __init__(self, samplerate=16000, hop_size=160): + self.fs = samplerate + self.hop = hop_size + + self.f0_bin = 256 + self.f0_max = 1100.0 + self.f0_min = 50.0 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + + def compute_f0(self, path, f0_method): + x = load_audio(path, self.fs) + # p_len = x.shape[0] // self.hop + if f0_method == "rmvpe": + if hasattr(self, "model_rmvpe") == False: + from infer.lib.rmvpe import RMVPE + + print("Loading rmvpe model") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device=device + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + return f0 + + def coarse_f0(self, f0): + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( + self.f0_bin - 2 + ) / (self.f0_mel_max - self.f0_mel_min) + 1 + + # use 0 or 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 + f0_coarse = np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( + f0_coarse.max(), + f0_coarse.min(), + ) + return f0_coarse + + def go(self, paths, f0_method): + if len(paths) == 0: + printt("no-f0-todo") + else: + printt("todo-f0-%s" % len(paths)) + n = max(len(paths) // 5, 1) # 每个进程最多打印5条 + for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): + try: + if idx % n == 0: + printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) + if ( + os.path.exists(opt_path1 + ".npy") == True + and os.path.exists(opt_path2 + ".npy") == True + ): + continue + featur_pit = self.compute_f0(inp_path, f0_method) + np.save( + opt_path2, + featur_pit, + allow_pickle=False, + ) # nsf + coarse_pit = self.coarse_f0(featur_pit) + np.save( + opt_path1, + coarse_pit, + allow_pickle=False, + ) # ori + except: + printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) + + +if __name__ == "__main__": + # exp_dir=r"E:\codes\py39\dataset\mi-test" + # n_p=16 + # f = open("%s/log_extract_f0.log"%exp_dir, "w") + printt(sys.argv) + featureInput = FeatureInput() + paths = [] + inp_root = "%s/1_16k_wavs" % (exp_dir) + opt_root1 = "%s/2a_f0" % (exp_dir) + opt_root2 = "%s/2b-f0nsf" % (exp_dir) + + os.makedirs(opt_root1, exist_ok=True) + os.makedirs(opt_root2, exist_ok=True) + for name in sorted(list(os.listdir(inp_root))): + inp_path = "%s/%s" % (inp_root, name) + if "spec" in inp_path: + continue + opt_path1 = "%s/%s" % (opt_root1, name) + opt_path2 = "%s/%s" % (opt_root2, name) + paths.append([inp_path, opt_path1, opt_path2]) + try: + featureInput.go(paths, "rmvpe") + except: + printt("f0_all_fail-%s" % (traceback.format_exc())) + # ps = [] + # for i in range(n_p): + # p = Process( + # target=featureInput.go, + # args=( + # paths[i::n_p], + # f0method, + # ), + # ) + # ps.append(p) + # p.start() + # for i in range(n_p): + # ps[i].join() diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py new file mode 100644 index 0000000..f771dd9 --- /dev/null +++ b/infer/modules/train/extract_feature_print.py @@ -0,0 +1,137 @@ +import os +import sys +import traceback + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" + +device = sys.argv[1] +n_part = int(sys.argv[2]) +i_part = int(sys.argv[3]) +if len(sys.argv) == 6: + exp_dir = sys.argv[4] + version = sys.argv[5] +else: + i_gpu = sys.argv[4] + exp_dir = sys.argv[5] + os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) + version = sys.argv[6] +import fairseq +import numpy as np +import soundfile as sf +import torch +import torch.nn.functional as F + +if "privateuseone" not in device: + device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" +else: + import torch_directml + + device = torch_directml.device(torch_directml.default_device()) + + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml + +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +printt(sys.argv) +model_path = "assets/hubert/hubert_base.pt" + +printt(exp_dir) +wavPath = "%s/1_16k_wavs" % exp_dir +outPath = ( + "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir +) +os.makedirs(outPath, exist_ok=True) + + +# wave must be 16k, hop_size=320 +def readwave(wav_path, normalize=False): + wav, sr = sf.read(wav_path) + assert sr == 16000 + feats = torch.from_numpy(wav).float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + if normalize: + with torch.no_grad(): + feats = F.layer_norm(feats, feats.shape) + feats = feats.view(1, -1) + return feats + + +# HuBERT model +printt("load model(s) from {}".format(model_path)) +# if hubert model is exist +if os.access(model_path, os.F_OK) == False: + printt( + "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" + % model_path + ) + exit(0) +models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="", +) +model = models[0] +model = model.to(device) +printt("move model to %s" % device) +if device not in ["mps", "cpu"]: + model = model.half() +model.eval() + +todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] +n = max(1, len(todo) // 10) # 最多打印十条 +if len(todo) == 0: + printt("no-feature-todo") +else: + printt("all-feature-%s" % len(todo)) + for idx, file in enumerate(todo): + try: + if file.endswith(".wav"): + wav_path = "%s/%s" % (wavPath, file) + out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) + + if os.path.exists(out_path): + continue + + feats = readwave(wav_path, normalize=saved_cfg.task.normalize) + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.half().to(device) + if device not in ["mps", "cpu"] + else feats.to(device), + "padding_mask": padding_mask.to(device), + "output_layer": 9 if version == "v1" else 12, # layer 9 + } + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = ( + model.final_proj(logits[0]) if version == "v1" else logits[0] + ) + + feats = feats.squeeze(0).float().cpu().numpy() + if np.isnan(feats).sum() == 0: + np.save(out_path, feats, allow_pickle=False) + else: + printt("%s-contains nan" % file) + if idx % n == 0: + printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) + except: + printt(traceback.format_exc()) + printt("all-feature-done") diff --git a/infer/modules/train/preprocess.py b/infer/modules/train/preprocess.py new file mode 100644 index 0000000..fbe8130 --- /dev/null +++ b/infer/modules/train/preprocess.py @@ -0,0 +1,147 @@ +import multiprocessing +import os +import sys + +from scipy import signal + +now_dir = os.getcwd() +sys.path.append(now_dir) +print(sys.argv) +inp_root = sys.argv[1] +sr = int(sys.argv[2]) +n_p = int(sys.argv[3]) +exp_dir = sys.argv[4] +noparallel = sys.argv[5] == "True" +per = float(sys.argv[6]) +import multiprocessing +import os +import traceback + +import librosa +import numpy as np +from scipy.io import wavfile + +from infer.lib.audio import load_audio +from infer.lib.slicer2 import Slicer + +mutex = multiprocessing.Lock() +f = open("%s/preprocess.log" % exp_dir, "a+") + + +def println(strr): + mutex.acquire() + print(strr) + f.write("%s\n" % strr) + f.flush() + mutex.release() + + +class PreProcess: + def __init__(self, sr, exp_dir, per=3.7): + self.slicer = Slicer( + sr=sr, + threshold=-42, + min_length=1500, + min_interval=400, + hop_size=15, + max_sil_kept=500, + ) + self.sr = sr + self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) + self.per = per + self.overlap = 0.3 + self.tail = self.per + self.overlap + self.max = 0.9 + self.alpha = 0.75 + self.exp_dir = exp_dir + self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir + self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir + os.makedirs(self.exp_dir, exist_ok=True) + os.makedirs(self.gt_wavs_dir, exist_ok=True) + os.makedirs(self.wavs16k_dir, exist_ok=True) + + def norm_write(self, tmp_audio, idx0, idx1): + tmp_max = np.abs(tmp_audio).max() + if tmp_max > 2.5: + print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max)) + return + tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + ( + 1 - self.alpha + ) * tmp_audio + wavfile.write( + "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), + self.sr, + tmp_audio.astype(np.float32), + ) + tmp_audio = librosa.resample( + tmp_audio, orig_sr=self.sr, target_sr=16000 + ) # , res_type="soxr_vhq" + wavfile.write( + "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), + 16000, + tmp_audio.astype(np.float32), + ) + + def pipeline(self, path, idx0): + try: + audio = load_audio(path, self.sr) + # zero phased digital filter cause pre-ringing noise... + # audio = signal.filtfilt(self.bh, self.ah, audio) + audio = signal.lfilter(self.bh, self.ah, audio) + + idx1 = 0 + for audio in self.slicer.slice(audio): + i = 0 + while 1: + start = int(self.sr * (self.per - self.overlap) * i) + i += 1 + if len(audio[start:]) > self.tail * self.sr: + tmp_audio = audio[start : start + int(self.per * self.sr)] + self.norm_write(tmp_audio, idx0, idx1) + idx1 += 1 + else: + tmp_audio = audio[start:] + idx1 += 1 + break + self.norm_write(tmp_audio, idx0, idx1) + println("%s->Suc." % path) + except: + println("%s->%s" % (path, traceback.format_exc())) + + def pipeline_mp(self, infos): + for path, idx0 in infos: + self.pipeline(path, idx0) + + def pipeline_mp_inp_dir(self, inp_root, n_p): + try: + infos = [ + ("%s/%s" % (inp_root, name), idx) + for idx, name in enumerate(sorted(list(os.listdir(inp_root)))) + ] + if noparallel: + for i in range(n_p): + self.pipeline_mp(infos[i::n_p]) + else: + ps = [] + for i in range(n_p): + p = multiprocessing.Process( + target=self.pipeline_mp, args=(infos[i::n_p],) + ) + ps.append(p) + p.start() + for i in range(n_p): + ps[i].join() + except: + println("Fail. %s" % traceback.format_exc()) + + +def preprocess_trainset(inp_root, sr, n_p, exp_dir, per): + pp = PreProcess(sr, exp_dir, per) + println("start preprocess") + println(sys.argv) + pp.pipeline_mp_inp_dir(inp_root, n_p) + println("end preprocess") + + +if __name__ == "__main__": + preprocess_trainset(inp_root, sr, n_p, exp_dir, per) diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py new file mode 100644 index 0000000..ab6ba2d --- /dev/null +++ b/infer/modules/train/train.py @@ -0,0 +1,643 @@ +import os +import sys +import logging + +logger = logging.getLogger(__name__) + +now_dir = os.getcwd() +sys.path.append(os.path.join(now_dir)) + +import datetime + +from infer.lib.train import utils + +hps = utils.get_hparams() +os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",") +n_gpus = len(hps.gpus.split("-")) +from random import randint, shuffle + +import torch + +try: + import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + + if torch.xpu.is_available(): + from infer.modules.ipex import ipex_init + from infer.modules.ipex.gradscaler import gradscaler_init + from torch.xpu.amp import autocast + + GradScaler = gradscaler_init() + ipex_init() + else: + from torch.cuda.amp import GradScaler, autocast +except Exception: + from torch.cuda.amp import GradScaler, autocast + +torch.backends.cudnn.deterministic = False +torch.backends.cudnn.benchmark = False +from time import sleep +from time import time as ttime + +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.nn import functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from infer.lib.infer_pack import commons +from infer.lib.train.data_utils import ( + DistributedBucketSampler, + TextAudioCollate, + TextAudioCollateMultiNSFsid, + TextAudioLoader, + TextAudioLoaderMultiNSFsid, +) + +if hps.version == "v1": + from infer.lib.infer_pack.models import MultiPeriodDiscriminator + from infer.lib.infer_pack.models import SynthesizerTrnMs256NSFsid as RVC_Model_f0 + from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0, + ) +else: + from infer.lib.infer_pack.models import ( + SynthesizerTrnMs768NSFsid as RVC_Model_f0, + SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0, + MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator, + ) + +from infer.lib.train.losses import ( + discriminator_loss, + feature_loss, + generator_loss, + kl_loss, +) +from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from infer.lib.train.process_ckpt import savee + +global_step = 0 + + +class EpochRecorder: + def __init__(self): + self.last_time = ttime() + + def record(self): + now_time = ttime() + elapsed_time = now_time - self.last_time + self.last_time = now_time + elapsed_time_str = str(datetime.timedelta(seconds=elapsed_time)) + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + return f"[{current_time}] | ({elapsed_time_str})" + + +def main(): + n_gpus = torch.cuda.device_count() + + if torch.cuda.is_available() == False and torch.backends.mps.is_available() == True: + n_gpus = 1 + if n_gpus < 1: + # patch to unblock people without gpus. there is probably a better way. + print("NO GPU DETECTED: falling back to CPU - this may take a while") + n_gpus = 1 + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(randint(20000, 55555)) + children = [] + for i in range(n_gpus): + subproc = mp.Process( + target=run, + args=(i, n_gpus, hps), + ) + children.append(subproc) + subproc.start() + + for i in range(n_gpus): + children[i].join() + + +def run( + rank, + n_gpus, + hps, +): + global global_step + if rank == 0: + logger = utils.get_logger(hps.model_dir) + logger.info(hps) + # utils.check_git_hash(hps.model_dir) + writer = SummaryWriter(log_dir=hps.model_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) + + dist.init_process_group( + backend="gloo", init_method="env://", world_size=n_gpus, rank=rank + ) + torch.manual_seed(hps.train.seed) + if torch.cuda.is_available(): + torch.cuda.set_device(rank) + + if hps.if_f0 == 1: + train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data) + else: + train_dataset = TextAudioLoader(hps.data.training_files, hps.data) + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size * n_gpus, + # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s + [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + # It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit. + # num_workers=8 -> num_workers=4 + if hps.if_f0 == 1: + collate_fn = TextAudioCollateMultiNSFsid() + else: + collate_fn = TextAudioCollate() + train_loader = DataLoader( + train_dataset, + num_workers=4, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=8, + ) + if hps.if_f0 == 1: + net_g = RVC_Model_f0( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model, + is_half=hps.train.fp16_run, + sr=hps.sample_rate, + ) + else: + net_g = RVC_Model_nof0( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model, + is_half=hps.train.fp16_run, + ) + if torch.cuda.is_available(): + net_g = net_g.cuda(rank) + net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm) + if torch.cuda.is_available(): + net_d = net_d.cuda(rank) + optim_g = torch.optim.AdamW( + net_g.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + optim_d = torch.optim.AdamW( + net_d.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) + # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + pass + elif torch.cuda.is_available(): + net_g = DDP(net_g, device_ids=[rank]) + net_d = DDP(net_d, device_ids=[rank]) + else: + net_g = DDP(net_g) + net_d = DDP(net_d) + + try: # 如果能加载自动resume + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d + ) # D多半加载没事 + if rank == 0: + logger.info("loaded D") + # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g + ) + global_step = (epoch_str - 1) * len(train_loader) + # epoch_str = 1 + # global_step = 0 + except: # 如果首次不能加载,加载pretrain + # traceback.print_exc() + epoch_str = 1 + global_step = 0 + if hps.pretrainG != "": + if rank == 0: + logger.info("loaded pretrained %s" % (hps.pretrainG)) + if hasattr(net_g, "module"): + logger.info( + net_g.module.load_state_dict( + torch.load(hps.pretrainG, map_location="cpu")["model"] + ) + ) ##测试不加载优化器 + else: + logger.info( + net_g.load_state_dict( + torch.load(hps.pretrainG, map_location="cpu")["model"] + ) + ) ##测试不加载优化器 + if hps.pretrainD != "": + if rank == 0: + logger.info("loaded pretrained %s" % (hps.pretrainD)) + if hasattr(net_d, "module"): + logger.info( + net_d.module.load_state_dict( + torch.load(hps.pretrainD, map_location="cpu")["model"] + ) + ) + else: + logger.info( + net_d.load_state_dict( + torch.load(hps.pretrainD, map_location="cpu")["model"] + ) + ) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 + ) + + scaler = GradScaler(enabled=hps.train.fp16_run) + + cache = [] + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + logger, + [writer, writer_eval], + cache, + ) + else: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + None, + None, + cache, + ) + scheduler_g.step() + scheduler_d.step() + + +def train_and_evaluate( + rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache +): + net_g, net_d = nets + optim_g, optim_d = optims + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + net_d.train() + + # Prepare data iterator + if hps.if_cache_data_in_gpu == True: + # Use Cache + data_iterator = cache + if cache == []: + # Make new cache + for batch_idx, info in enumerate(train_loader): + # Unpack + if hps.if_f0 == 1: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + ( + phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + # Load on CUDA + if torch.cuda.is_available(): + phone = phone.cuda(rank, non_blocking=True) + phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + if hps.if_f0 == 1: + pitch = pitch.cuda(rank, non_blocking=True) + pitchf = pitchf.cuda(rank, non_blocking=True) + sid = sid.cuda(rank, non_blocking=True) + spec = spec.cuda(rank, non_blocking=True) + spec_lengths = spec_lengths.cuda(rank, non_blocking=True) + wave = wave.cuda(rank, non_blocking=True) + wave_lengths = wave_lengths.cuda(rank, non_blocking=True) + # Cache on list + if hps.if_f0 == 1: + cache.append( + ( + batch_idx, + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), + ) + ) + else: + cache.append( + ( + batch_idx, + ( + phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), + ) + ) + else: + # Load shuffled cache + shuffle(cache) + else: + # Loader + data_iterator = enumerate(train_loader) + + # Run steps + epoch_recorder = EpochRecorder() + for batch_idx, info in data_iterator: + # Data + ## Unpack + if hps.if_f0 == 1: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info + ## Load on CUDA + if (hps.if_cache_data_in_gpu == False) and torch.cuda.is_available(): + phone = phone.cuda(rank, non_blocking=True) + phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + if hps.if_f0 == 1: + pitch = pitch.cuda(rank, non_blocking=True) + pitchf = pitchf.cuda(rank, non_blocking=True) + sid = sid.cuda(rank, non_blocking=True) + spec = spec.cuda(rank, non_blocking=True) + spec_lengths = spec_lengths.cuda(rank, non_blocking=True) + wave = wave.cuda(rank, non_blocking=True) + # wave_lengths = wave_lengths.cuda(rank, non_blocking=True) + + # Calculate + with autocast(enabled=hps.train.fp16_run): + if hps.if_f0 == 1: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid) + else: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, spec, spec_lengths, sid) + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + y_mel = commons.slice_segments( + mel, ids_slice, hps.train.segment_size // hps.data.hop_length + ) + with autocast(enabled=False): + y_hat_mel = mel_spectrogram_torch( + y_hat.float().squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + if hps.train.fp16_run == True: + y_hat_mel = y_hat_mel.half() + wave = commons.slice_segments( + wave, ids_slice * hps.data.hop_length, hps.train.segment_size + ) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( + y_d_hat_r, y_d_hat_g + ) + optim_d.zero_grad() + scaler.scale(loss_disc).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) + with autocast(enabled=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, 100.0 * batch_idx / len(train_loader) + ) + ) + # Amor For Tensorboard display + if loss_mel > 75: + loss_mel = 75 + if loss_kl > 9: + loss_kl = 9 + + logger.info([global_step, lr]) + logger.info( + f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}" + ) + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/total": loss_disc, + "learning_rate": lr, + "grad_norm_d": grad_norm_d, + "grad_norm_g": grad_norm_g, + } + scalar_dict.update( + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl": loss_kl, + } + ) + + scalar_dict.update( + {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} + ) + scalar_dict.update( + {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)} + ) + scalar_dict.update( + {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)} + ) + image_dict = { + "slice/mel_org": utils.plot_spectrogram_to_numpy( + y_mel[0].data.cpu().numpy() + ), + "slice/mel_gen": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].data.cpu().numpy() + ), + "all/mel": utils.plot_spectrogram_to_numpy( + mel[0].data.cpu().numpy() + ), + } + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + ) + global_step += 1 + # /Run steps + + if epoch % hps.save_every_epoch == 0 and rank == 0: + if hps.if_latest == 0: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(global_step)), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(global_step)), + ) + else: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(2333333)), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(2333333)), + ) + if rank == 0 and hps.save_every_weights == "1": + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving ckpt %s_e%s:%s" + % ( + hps.name, + epoch, + savee( + ckpt, + hps.sample_rate, + hps.if_f0, + hps.name + "_e%s_s%s" % (epoch, global_step), + epoch, + hps.version, + hps, + ), + ) + ) + + if rank == 0: + logger.info("====> Epoch: {} {}".format(epoch, epoch_recorder.record())) + if epoch >= hps.total_epoch and rank == 0: + logger.info("Training is done. The program is closed.") + + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving final ckpt:%s" + % ( + savee( + ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps + ) + ) + ) + sleep(1) + os._exit(2333333) + + +if __name__ == "__main__": + torch.multiprocessing.set_start_method("spawn") + main() diff --git a/infer/modules/uvr5/.ipynb_checkpoints/vr-checkpoint.py b/infer/modules/uvr5/.ipynb_checkpoints/vr-checkpoint.py new file mode 100644 index 0000000..3335c27 --- /dev/null +++ b/infer/modules/uvr5/.ipynb_checkpoints/vr-checkpoint.py @@ -0,0 +1,195 @@ +import os +import logging + +logger = logging.getLogger(__name__) + +import librosa +import numpy as np +import soundfile as sf +import torch + +from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets +from infer.lib.uvr5_pack.lib_v5 import spec_utils +from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters +from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet +from infer.lib.uvr5_pack.utils import inference + + +class AudioPre: + def __init__(self, agg, model_path, device, is_half, tta=False): + self.model_path = model_path + self.device = device + self.data = { + # Processing Options + "postprocess": False, + "tta": tta, + # Constants + "window_size": 512, + "agg": agg, + "high_end_process": "mirroring", + } + mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") + model = Nets.CascadedASPPNet(mp.param["bins"] * 2) + cpk = torch.load(model_path, map_location="cpu") + model.load_state_dict(cpk) + model.eval() + if is_half: + model = model.half().to(device) + else: + model = model.to(device) + + self.mp = mp + self.model = model + + def _path_audio_( + self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False + ): + if ins_root is None and vocal_root is None: + return "No save root." + name = os.path.basename(music_file) + if ins_root is not None: + os.makedirs(ins_root, exist_ok=True) + if vocal_root is not None: + os.makedirs(vocal_root, exist_ok=True) + X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + bands_n = len(self.mp.param["band"]) + # print(bands_n) + for d in range(bands_n, 0, -1): + bp = self.mp.param["band"][d] + if d == bands_n: # high-end band + ( + X_wave[d], + _, + ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 + music_file, + bp["sr"], + False, + dtype=np.float32, + res_type=bp["res_type"], + ) + if X_wave[d].ndim == 1: + X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) + else: # lower bands + X_wave[d] = librosa.core.resample( + X_wave[d + 1], + self.mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + # Stft of wave source + X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( + X_wave[d], + bp["hl"], + bp["n_fft"], + self.mp.param["mid_side"], + self.mp.param["mid_side_b2"], + self.mp.param["reverse"], + ) + # pdb.set_trace() + if d == bands_n and self.data["high_end_process"] != "none": + input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( + self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] + ) + input_high_end = X_spec_s[d][ + :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : + ] + + X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) + aggresive_set = float(self.data["agg"] / 100) + aggressiveness = { + "value": aggresive_set, + "split_bin": self.mp.param["band"][1]["crop_stop"], + } + with torch.no_grad(): + pred, X_mag, X_phase = inference( + X_spec_m, self.device, self.model, aggressiveness, self.data + ) + # Postprocess + if self.data["postprocess"]: + pred_inv = np.clip(X_mag - pred, 0, np.inf) + pred = spec_utils.mask_silence(pred, pred_inv) + y_spec_m = pred * X_phase + v_spec_m = X_spec_m - y_spec_m + + if ins_root is not None: + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], y_spec_m, input_high_end, self.mp + ) + wav_instrument = spec_utils.cmb_spectrogram_to_wave( + y_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) + logger.info("%s instruments done" % name) + if is_hp3 == True: + head = "vocal_" + else: + head = "instrument_" + if format in ["wav", "flac"]: + sf.write( + os.path.join( + ins_root, + head + "{}_{}.{}".format(name, self.data["agg"], format), + ), + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) # + else: + path = os.path.join( + ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + opt_format_path = path[:-4] + ".%s" % format + os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass + if vocal_root is not None: + if is_hp3 == True: + head = "instrument_" + else: + head = "vocal_" + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], v_spec_m, input_high_end, self.mp + ) + wav_vocals = spec_utils.cmb_spectrogram_to_wave( + v_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) + logger.info("%s vocals done" % name) + if format in ["wav", "flac"]: + sf.write( + os.path.join( + vocal_root, + head + "{}_{}.{}".format(name, self.data["agg"], format), + ), + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + else: + path = os.path.join( + vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + opt_format_path = path[:-4] + ".%s" % format + os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass diff --git a/infer/modules/uvr5/__pycache__/mdxnet.cpython-310.pyc b/infer/modules/uvr5/__pycache__/mdxnet.cpython-310.pyc new file mode 100644 index 0000000..815de63 Binary files /dev/null and b/infer/modules/uvr5/__pycache__/mdxnet.cpython-310.pyc differ diff --git a/infer/modules/uvr5/__pycache__/vr.cpython-310.pyc b/infer/modules/uvr5/__pycache__/vr.cpython-310.pyc new file mode 100644 index 0000000..fec3520 Binary files /dev/null and b/infer/modules/uvr5/__pycache__/vr.cpython-310.pyc differ diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py new file mode 100644 index 0000000..2f246db --- /dev/null +++ b/infer/modules/uvr5/mdxnet.py @@ -0,0 +1,256 @@ +import os +import logging + +logger = logging.getLogger(__name__) + +import librosa +import numpy as np +import soundfile as sf +import torch +from tqdm import tqdm + +cpu = torch.device("cpu") + + +class ConvTDFNetTrim: + def __init__( + self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 + ): + super(ConvTDFNetTrim, self).__init__() + + self.dim_f = dim_f + self.dim_t = 2**dim_t + self.n_fft = n_fft + self.hop = hop + self.n_bins = self.n_fft // 2 + 1 + self.chunk_size = hop * (self.dim_t - 1) + self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( + device + ) + self.target_name = target_name + self.blender = "blender" in model_name + + self.dim_c = 4 + out_c = self.dim_c * 4 if target_name == "*" else self.dim_c + self.freq_pad = torch.zeros( + [1, out_c, self.n_bins - self.dim_f, self.dim_t] + ).to(device) + + self.n = L // 2 + + def stft(self, x): + x = x.reshape([-1, self.chunk_size]) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop, + window=self.window, + center=True, + return_complex=True, + ) + x = torch.view_as_real(x) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( + [-1, self.dim_c, self.n_bins, self.dim_t] + ) + return x[:, :, : self.dim_f] + + def istft(self, x, freq_pad=None): + freq_pad = ( + self.freq_pad.repeat([x.shape[0], 1, 1, 1]) + if freq_pad is None + else freq_pad + ) + x = torch.cat([x, freq_pad], -2) + c = 4 * 2 if self.target_name == "*" else 2 + x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( + [-1, 2, self.n_bins, self.dim_t] + ) + x = x.permute([0, 2, 3, 1]) + x = x.contiguous() + x = torch.view_as_complex(x) + x = torch.istft( + x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True + ) + return x.reshape([-1, c, self.chunk_size]) + + +def get_models(device, dim_f, dim_t, n_fft): + return ConvTDFNetTrim( + device=device, + model_name="Conv-TDF", + target_name="vocals", + L=11, + dim_f=dim_f, + dim_t=dim_t, + n_fft=n_fft, + ) + + +class Predictor: + def __init__(self, args): + import onnxruntime as ort + + logger.info(ort.get_available_providers()) + self.args = args + self.model_ = get_models( + device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft + ) + self.model = ort.InferenceSession( + os.path.join(args.onnx, self.model_.target_name + ".onnx"), + providers=[ + "CUDAExecutionProvider", + "DmlExecutionProvider", + "CPUExecutionProvider", + ], + ) + logger.info("ONNX load done") + + def demix(self, mix): + samples = mix.shape[-1] + margin = self.args.margin + chunk_size = self.args.chunks * 44100 + assert not margin == 0, "margin cannot be zero!" + if margin > chunk_size: + margin = chunk_size + + segmented_mix = {} + + if self.args.chunks == 0 or samples < chunk_size: + chunk_size = samples + + counter = -1 + for skip in range(0, samples, chunk_size): + counter += 1 + + s_margin = 0 if counter == 0 else margin + end = min(skip + chunk_size + margin, samples) + + start = skip - s_margin + + segmented_mix[skip] = mix[:, start:end].copy() + if end == samples: + break + + sources = self.demix_base(segmented_mix, margin_size=margin) + """ + mix:(2,big_sample) + segmented_mix:offset->(2,small_sample) + sources:(1,2,big_sample) + """ + return sources + + def demix_base(self, mixes, margin_size): + chunked_sources = [] + progress_bar = tqdm(total=len(mixes)) + progress_bar.set_description("Processing") + for mix in mixes: + cmix = mixes[mix] + sources = [] + n_sample = cmix.shape[1] + model = self.model_ + trim = model.n_fft // 2 + gen_size = model.chunk_size - 2 * trim + pad = gen_size - n_sample % gen_size + mix_p = np.concatenate( + (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 + ) + mix_waves = [] + i = 0 + while i < n_sample + pad: + waves = np.array(mix_p[:, i : i + model.chunk_size]) + mix_waves.append(waves) + i += gen_size + mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu) + with torch.no_grad(): + _ort = self.model + spek = model.stft(mix_waves) + if self.args.denoise: + spec_pred = ( + -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5 + + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5 + ) + tar_waves = model.istft(torch.tensor(spec_pred)) + else: + tar_waves = model.istft( + torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) + ) + tar_signal = ( + tar_waves[:, :, trim:-trim] + .transpose(0, 1) + .reshape(2, -1) + .numpy()[:, :-pad] + ) + + start = 0 if mix == 0 else margin_size + end = None if mix == list(mixes.keys())[::-1][0] else -margin_size + if margin_size == 0: + end = None + sources.append(tar_signal[:, start:end]) + + progress_bar.update(1) + + chunked_sources.append(sources) + _sources = np.concatenate(chunked_sources, axis=-1) + # del self.model + progress_bar.close() + return _sources + + def prediction(self, m, vocal_root, others_root, format): + os.makedirs(vocal_root, exist_ok=True) + os.makedirs(others_root, exist_ok=True) + basename = os.path.basename(m) + mix, rate = librosa.load(m, mono=False, sr=44100) + if mix.ndim == 1: + mix = np.asfortranarray([mix, mix]) + mix = mix.T + sources = self.demix(mix.T) + opt = sources[0].T + if format in ["wav", "flac"]: + sf.write( + "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate + ) + sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) + else: + path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) + path_other = "%s/%s_others.wav" % (others_root, basename) + sf.write(path_vocal, mix - opt, rate) + sf.write(path_other, opt, rate) + opt_path_vocal = path_vocal[:-4] + ".%s" % format + opt_path_other = path_other[:-4] + ".%s" % format + if os.path.exists(path_vocal): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal) + ) + if os.path.exists(opt_path_vocal): + try: + os.remove(path_vocal) + except: + pass + if os.path.exists(path_other): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other) + ) + if os.path.exists(opt_path_other): + try: + os.remove(path_other) + except: + pass + + +class MDXNetDereverb: + def __init__(self, chunks, device): + self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy" + self.shifts = 10 # 'Predict with randomised equivariant stabilisation' + self.mixing = "min_mag" # ['default','min_mag','max_mag'] + self.chunks = chunks + self.margin = 44100 + self.dim_t = 9 + self.dim_f = 3072 + self.n_fft = 6144 + self.denoise = True + self.pred = Predictor(self) + self.device = device + + def _path_audio_(self, input, vocal_root, others_root, format, is_hp3=False): + self.pred.prediction(input, vocal_root, others_root, format) diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py new file mode 100644 index 0000000..bce3cef --- /dev/null +++ b/infer/modules/uvr5/modules.py @@ -0,0 +1,108 @@ +import os +import traceback +import logging + +logger = logging.getLogger(__name__) + +import ffmpeg +import torch + +from configs.config import Config +from infer.modules.uvr5.mdxnet import MDXNetDereverb +from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho + +config = Config() + + +def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): + infos = [] + try: + inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + save_root_vocal = ( + save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + save_root_ins = ( + save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + if model_name == "onnx_dereverb_By_FoxJoy": + pre_fun = MDXNetDereverb(15, config.device) + else: + func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho + pre_fun = func( + agg=int(agg), + model_path=os.path.join( + os.getenv("weight_uvr5_root"), model_name + ".pth" + ), + device=config.device, + is_half=config.is_half, + ) + is_hp3 = "HP3" in model_name + if inp_root != "": + paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] + else: + paths = [path.name for path in paths] + for path in paths: + inp_path = os.path.join(inp_root, path) + need_reformat = 1 + done = 0 + try: + info = ffmpeg.probe(inp_path, cmd="ffprobe") + if ( + info["streams"][0]["channels"] == 2 + and info["streams"][0]["sample_rate"] == "44100" + ): + need_reformat = 0 + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 + ) + done = 1 + except: + need_reformat = 1 + traceback.print_exc() + if need_reformat == 1: + tmp_path = "%s/%s.reformatted.wav" % ( + os.path.join(os.environ["TEMP"]), + os.path.basename(inp_path), + ) + os.system( + "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" + % (inp_path, tmp_path) + ) + inp_path = tmp_path + try: + if done == 0: + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0 + ) + infos.append("%s->Success" % (os.path.basename(inp_path))) + yield "\n".join(infos) + except: + try: + if done == 0: + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0 + ) + infos.append("%s->Success" % (os.path.basename(inp_path))) + yield "\n".join(infos) + except: + infos.append( + "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) + ) + yield "\n".join(infos) + except: + infos.append(traceback.format_exc()) + yield "\n".join(infos) + finally: + try: + if model_name == "onnx_dereverb_By_FoxJoy": + del pre_fun.pred.model + del pre_fun.pred.model_ + else: + del pre_fun.model + del pre_fun + except: + traceback.print_exc() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + logger.info("Executed torch.cuda.empty_cache()") + yield "\n".join(infos) diff --git a/infer/modules/uvr5/vr.py b/infer/modules/uvr5/vr.py new file mode 100644 index 0000000..3335c27 --- /dev/null +++ b/infer/modules/uvr5/vr.py @@ -0,0 +1,195 @@ +import os +import logging + +logger = logging.getLogger(__name__) + +import librosa +import numpy as np +import soundfile as sf +import torch + +from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets +from infer.lib.uvr5_pack.lib_v5 import spec_utils +from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters +from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet +from infer.lib.uvr5_pack.utils import inference + + +class AudioPre: + def __init__(self, agg, model_path, device, is_half, tta=False): + self.model_path = model_path + self.device = device + self.data = { + # Processing Options + "postprocess": False, + "tta": tta, + # Constants + "window_size": 512, + "agg": agg, + "high_end_process": "mirroring", + } + mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") + model = Nets.CascadedASPPNet(mp.param["bins"] * 2) + cpk = torch.load(model_path, map_location="cpu") + model.load_state_dict(cpk) + model.eval() + if is_half: + model = model.half().to(device) + else: + model = model.to(device) + + self.mp = mp + self.model = model + + def _path_audio_( + self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False + ): + if ins_root is None and vocal_root is None: + return "No save root." + name = os.path.basename(music_file) + if ins_root is not None: + os.makedirs(ins_root, exist_ok=True) + if vocal_root is not None: + os.makedirs(vocal_root, exist_ok=True) + X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + bands_n = len(self.mp.param["band"]) + # print(bands_n) + for d in range(bands_n, 0, -1): + bp = self.mp.param["band"][d] + if d == bands_n: # high-end band + ( + X_wave[d], + _, + ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 + music_file, + bp["sr"], + False, + dtype=np.float32, + res_type=bp["res_type"], + ) + if X_wave[d].ndim == 1: + X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) + else: # lower bands + X_wave[d] = librosa.core.resample( + X_wave[d + 1], + self.mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + # Stft of wave source + X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( + X_wave[d], + bp["hl"], + bp["n_fft"], + self.mp.param["mid_side"], + self.mp.param["mid_side_b2"], + self.mp.param["reverse"], + ) + # pdb.set_trace() + if d == bands_n and self.data["high_end_process"] != "none": + input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( + self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] + ) + input_high_end = X_spec_s[d][ + :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : + ] + + X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) + aggresive_set = float(self.data["agg"] / 100) + aggressiveness = { + "value": aggresive_set, + "split_bin": self.mp.param["band"][1]["crop_stop"], + } + with torch.no_grad(): + pred, X_mag, X_phase = inference( + X_spec_m, self.device, self.model, aggressiveness, self.data + ) + # Postprocess + if self.data["postprocess"]: + pred_inv = np.clip(X_mag - pred, 0, np.inf) + pred = spec_utils.mask_silence(pred, pred_inv) + y_spec_m = pred * X_phase + v_spec_m = X_spec_m - y_spec_m + + if ins_root is not None: + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], y_spec_m, input_high_end, self.mp + ) + wav_instrument = spec_utils.cmb_spectrogram_to_wave( + y_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) + logger.info("%s instruments done" % name) + if is_hp3 == True: + head = "vocal_" + else: + head = "instrument_" + if format in ["wav", "flac"]: + sf.write( + os.path.join( + ins_root, + head + "{}_{}.{}".format(name, self.data["agg"], format), + ), + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) # + else: + path = os.path.join( + ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + opt_format_path = path[:-4] + ".%s" % format + os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass + if vocal_root is not None: + if is_hp3 == True: + head = "instrument_" + else: + head = "vocal_" + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], v_spec_m, input_high_end, self.mp + ) + wav_vocals = spec_utils.cmb_spectrogram_to_wave( + v_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) + logger.info("%s vocals done" % name) + if format in ["wav", "flac"]: + sf.write( + os.path.join( + vocal_root, + head + "{}_{}.{}".format(name, self.data["agg"], format), + ), + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + else: + path = os.path.join( + vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + opt_format_path = path[:-4] + ".%s" % format + os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass diff --git a/infer/modules/vc/__init__.py b/infer/modules/vc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py new file mode 100644 index 0000000..1775012 --- /dev/null +++ b/infer/modules/vc/modules.py @@ -0,0 +1,302 @@ +import traceback +import logging + +logger = logging.getLogger(__name__) + +import numpy as np +import soundfile as sf +import torch +from io import BytesIO + +from infer.lib.audio import load_audio, wav2 +from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from infer.modules.vc.pipeline import Pipeline +from infer.modules.vc.utils import * + + +class VC: + def __init__(self, config): + self.n_spk = None + self.tgt_sr = None + self.net_g = None + self.pipeline = None + self.cpt = None + self.version = None + self.if_f0 = None + self.version = None + self.hubert_model = None + + self.config = config + + def get_vc(self, sid, *to_return_protect): + logger.info("Get sid: " + sid) + + to_return_protect0 = { + "visible": self.if_f0 != 0, + "value": to_return_protect[0] + if self.if_f0 != 0 and to_return_protect + else 0.5, + "__type__": "update", + } + to_return_protect1 = { + "visible": self.if_f0 != 0, + "value": to_return_protect[1] + if self.if_f0 != 0 and to_return_protect + else 0.33, + "__type__": "update", + } + + if sid == "" or sid == []: + if self.hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 + logger.info("Clean model cache") + del (self.net_g, self.n_spk, self.hubert_model, self.tgt_sr) # ,cpt + self.hubert_model = ( + self.net_g + ) = self.n_spk = self.hubert_model = self.tgt_sr = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + ###楼下不这么折腾清理不干净 + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + if self.version == "v1": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs256NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"]) + elif self.version == "v2": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs768NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"]) + del self.net_g, self.cpt + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return ( + {"visible": False, "__type__": "update"}, + { + "visible": True, + "value": to_return_protect0, + "__type__": "update", + }, + { + "visible": True, + "value": to_return_protect1, + "__type__": "update", + }, + "", + "", + ) + person = f'{os.getenv("weight_root")}/{sid}' + logger.info(f"Loading: {person}") + + self.cpt = torch.load(person, map_location="cpu") + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + + synthesizer_class = { + ("v1", 1): SynthesizerTrnMs256NSFsid, + ("v1", 0): SynthesizerTrnMs256NSFsid_nono, + ("v2", 1): SynthesizerTrnMs768NSFsid, + ("v2", 0): SynthesizerTrnMs768NSFsid_nono, + } + + self.net_g = synthesizer_class.get( + (self.version, self.if_f0), SynthesizerTrnMs256NSFsid + )(*self.cpt["config"], is_half=self.config.is_half) + + del self.net_g.enc_q + + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g.eval().to(self.config.device) + if self.config.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + + self.pipeline = Pipeline(self.tgt_sr, self.config) + n_spk = self.cpt["config"][-3] + index = {"value": get_index_path_from_model(sid), "__type__": "update"} + logger.info("Select index: " + index["value"]) + + return ( + ( + {"visible": True, "maximum": n_spk, "__type__": "update"}, + to_return_protect0, + to_return_protect1, + index, + index, + ) + if to_return_protect + else {"visible": True, "maximum": n_spk, "__type__": "update"} + ) + + def vc_single( + self, + sid, + input_audio_path, + f0_up_key, + f0_file, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ): + if input_audio_path is None: + return "You need to upload an audio", None + f0_up_key = int(f0_up_key) + try: + audio = load_audio(input_audio_path, 16000) + audio_max = np.abs(audio).max() / 0.95 + if audio_max > 1: + audio /= audio_max + times = [0, 0, 0] + + if self.hubert_model is None: + self.hubert_model = load_hubert(self.config) + + if file_index: + file_index = ( + file_index.strip(" ") + .strip('"') + .strip("\n") + .strip('"') + .strip(" ") + .replace("trained", "added") + ) + elif file_index2: + file_index = file_index2 + else: + file_index = "" # 防止小白写错,自动帮他替换掉 + + audio_opt = self.pipeline.pipeline( + self.hubert_model, + self.net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + self.if_f0, + filter_radius, + self.tgt_sr, + resample_sr, + rms_mix_rate, + self.version, + protect, + f0_file, + ) + if self.tgt_sr != resample_sr >= 16000: + tgt_sr = resample_sr + else: + tgt_sr = self.tgt_sr + index_info = ( + "Index:\n%s." % file_index + if os.path.exists(file_index) + else "Index not used." + ) + return ( + "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs." + % (index_info, *times), + (tgt_sr, audio_opt), + ) + except: + info = traceback.format_exc() + logger.warning(info) + return info, (None, None) + + def vc_multi( + self, + sid, + dir_path, + opt_root, + paths, + f0_up_key, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + format1, + ): + try: + dir_path = ( + dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + os.makedirs(opt_root, exist_ok=True) + try: + if dir_path != "": + paths = [ + os.path.join(dir_path, name) for name in os.listdir(dir_path) + ] + else: + paths = [path.name for path in paths] + except: + traceback.print_exc() + paths = [path.name for path in paths] + infos = [] + for path in paths: + info, opt = self.vc_single( + sid, + path, + f0_up_key, + None, + f0_method, + file_index, + file_index2, + # file_big_npy, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ) + if "Success" in info: + try: + tgt_sr, audio_opt = opt + if format1 in ["wav", "flac"]: + sf.write( + "%s/%s.%s" + % (opt_root, os.path.basename(path), format1), + audio_opt, + tgt_sr, + ) + else: + path = "%s/%s.%s" % ( + opt_root, + os.path.basename(path), + format1, + ) + with BytesIO() as wavf: + sf.write(wavf, audio_opt, tgt_sr, format="wav") + wavf.seek(0, 0) + with open(path, "wb") as outf: + wav2(wavf, outf, format1) + except: + info += traceback.format_exc() + infos.append("%s->%s" % (os.path.basename(path), info)) + yield "\n".join(infos) + yield "\n".join(infos) + except: + yield traceback.format_exc() diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py new file mode 100644 index 0000000..9e3e387 --- /dev/null +++ b/infer/modules/vc/pipeline.py @@ -0,0 +1,457 @@ +import os +import sys +import traceback +import logging + +logger = logging.getLogger(__name__) + +from functools import lru_cache +from time import time as ttime + +import faiss +import librosa +import numpy as np +import parselmouth +import pyworld +import torch +import torch.nn.functional as F +import torchcrepe +from scipy import signal + +now_dir = os.getcwd() +sys.path.append(now_dir) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + +def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 + # print(data1.max(),data2.max()) + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # 每半秒一个点 + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + data2 *= ( + torch.pow(rms1, torch.tensor(1 - rate)) + * torch.pow(rms2, torch.tensor(rate - 1)) + ).numpy() + return data2 + + +class Pipeline(object): + def __init__(self, tgt_sr, config): + self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( + config.x_pad, + config.x_query, + config.x_center, + config.x_max, + config.is_half, + ) + self.sr = 16000 # hubert输入采样率 + self.window = 160 # 每帧点数 + self.t_pad = self.sr * self.x_pad # 每条前后pad时间 + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # 查询切点前后查询时间 + self.t_center = self.sr * self.x_center # 查询切点位置 + self.t_max = self.sr * self.x_max # 免查询时长阈值 + self.device = config.device + + def get_f0( + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0=None, + ): + global input_audio_path2wav + time_step = self.window / self.sr * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + if f0_method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + elif f0_method == "crepe": + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + elif f0_method == "rmvpe": + if not hasattr(self, "model_rmvpe"): + from infer.lib.rmvpe import RMVPE + + logger.info( + "Loading rmvpe model,%s" % "%s/rmvpe.pt" % os.environ["rmvpe_root"] + ) + self.model_rmvpe = RMVPE( + "%s/rmvpe.pt" % os.environ["rmvpe_root"], + is_half=self.is_half, + device=self.device, + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + + if "privateuseone" in str(self.device): # clean ortruntime memory + del self.model_rmvpe.model + del self.model_rmvpe + logger.info("Cleaning ortruntime memory") + + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int32) + return f0_coarse, f0bak # 1-0 + + def vc( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + ): # ,file_index,file_big_npy + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + inputs = { + "source": feats.to(self.device), + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + t0 = ttime() + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = feats.clone() + if ( + not isinstance(index, type(None)) + and not isinstance(big_npy, type(None)) + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + # _, I = index.search(npy, 1) + # npy = big_npy[I.squeeze()] + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + t1 = ttime() + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch is not None and pitchf is not None: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + hasp = pitch is not None and pitchf is not None + arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid) + audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy() + del hasp, arg + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + t2 = ttime() + times[0] += t1 - t0 + times[2] += t2 - t1 + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + f0_file=None, + ): + if ( + file_index != "" + # and file_big_npy != "" + # and os.path.exists(file_big_npy) == True + and os.path.exists(file_index) + and index_rate != 0 + ): + try: + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + except: + traceback.print_exc() + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += np.abs(audio_pad[i : i - self.window]) + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + audio_sum[t - self.t_query : t + self.t_query] + == audio_sum[t - self.t_query : t + self.t_query].min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name"): + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except: + traceback.print_exc() + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if "mps" not in str(self.device) or "xpu" not in str(self.device): + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + t2 = ttime() + times[1] += t2 - t1 + for t in opt_ts: + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) + if tgt_sr != resample_sr >= 16000: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py new file mode 100644 index 0000000..c128707 --- /dev/null +++ b/infer/modules/vc/utils.py @@ -0,0 +1,33 @@ +import os + +from fairseq import checkpoint_utils + + +def get_index_path_from_model(sid): + return next( + ( + f + for f in [ + os.path.join(root, name) + for root, _, files in os.walk(os.getenv("index_root"), topdown=False) + for name in files + if name.endswith(".index") and "trained" not in name + ] + if sid.split(".")[0] in f + ), + "", + ) + + +def load_hubert(config): + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + ["assets/hubert/hubert_base.pt"], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(config.device) + if config.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + return hubert_model.eval()