forked from yjzxkxdn/mel-keyshift-test
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwav2mel.py
More file actions
124 lines (103 loc) · 3.93 KB
/
wav2mel.py
File metadata and controls
124 lines (103 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# 从SingingVocoders项目复制
# https://github.com/openvpi/SingingVocoders/blob/main/utils/wav2mel.py
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
# from loguru import logger
class PitchAdjustableMelSpectrogram:
    """Mel-spectrogram extractor supporting key shift and time-stretch.

    A key shift of ``k`` semitones scales the FFT and window lengths by
    ``2**(k/12)`` so the analysis frame tracks the shifted pitch; ``speed``
    scales the hop length. Mel filter banks and Hann windows are cached per
    device (and per ``f_max`` / ``key_shift`` respectively) so repeated calls
    do not rebuild them.
    """

    def __init__(
        self,
        sample_rate=44100,
        n_fft=2048,
        win_length=2048,
        hop_length=512,
        f_min=40,
        f_max=16000,
        n_mels=128,
        center=False,
    ):
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_size = win_length
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max
        self.n_mels = n_mels
        self.center = center
        # Caches, keyed "<f_max>_<device>" and "<key_shift>_<device>".
        self.mel_basis = {}
        self.hann_window = {}

    def __call__(self, y, key_shift=0, speed=1.0):
        """Return the (linear-amplitude) mel spectrogram of waveform ``y``.

        ``y`` is expected to be a batched waveform tensor of shape
        ``(batch, samples)`` — TODO confirm against callers.
        """
        scale = 2 ** (key_shift / 12)
        shifted_n_fft = int(np.round(self.n_fft * scale))
        shifted_win = int(np.round(self.win_size * scale))
        stretched_hop = int(np.round(self.hop_length * speed))

        # The mel filter bank is always built for the *unshifted* n_fft:
        # the shifted spectrogram is resized back to that bin count below.
        mel_key = f"{self.f_max}_{y.device}"
        if mel_key not in self.mel_basis:
            bank = librosa_mel_fn(
                sr=self.sample_rate,
                n_fft=self.n_fft,
                n_mels=self.n_mels,
                fmin=self.f_min,
                fmax=self.f_max,
            )
            self.mel_basis[mel_key] = torch.from_numpy(bank).float().to(y.device)

        win_key = f"{key_shift}_{y.device}"
        if win_key not in self.hann_window:
            self.hann_window[win_key] = torch.hann_window(
                shifted_win, device=y.device
            )

        # Manual reflect padding so frames are centered despite center=False.
        pad_left = int((shifted_win - stretched_hop) // 2)
        pad_right = int((shifted_win - stretched_hop + 1) // 2)
        y = torch.nn.functional.pad(
            y.unsqueeze(1), (pad_left, pad_right), mode="reflect"
        ).squeeze(1)

        spec = torch.stft(
            y,
            shifted_n_fft,
            hop_length=stretched_hop,
            win_length=shifted_win,
            window=self.hann_window[win_key],
            center=self.center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        ).abs()

        if key_shift != 0:
            # Resize the shifted spectrogram back to the unshifted bin count
            # and rescale amplitudes for the changed window length.
            target_bins = self.n_fft // 2 + 1
            actual_bins = spec.size(1)
            if actual_bins < target_bins:
                spec = F.pad(spec, (0, 0, 0, target_bins - actual_bins))
            spec = spec[:, :target_bins, :] * self.win_size / shifted_win

        return torch.matmul(self.mel_basis[mel_key], spec)

    def dynamic_range_compression_torch(self, x, C=1, clip_val=1e-5):
        """Log-compress ``x``: ``log(clamp(x, min=clip_val) * C)``."""
        return torch.log(torch.clamp(x, min=clip_val) * C)
if __name__ == '__main__':
    # Smoke-test script: extract mel spectrograms for every wav in a
    # hard-coded dataset directory (opencpop raw wavs).
    import glob

    import torchaudio
    from tqdm import tqdm

    wav_paths = glob.glob(r'D:\propj\Disa\data\opencpop\raw\wavs/**.wav')
    torch.set_num_threads(1)

    # Build the transform ONCE: it caches mel filter banks and Hann windows
    # internally, so constructing it inside the loop (as before) threw the
    # cache away and rebuilt the filter bank for every single file.
    mel_spec_transform = PitchAdjustableMelSpectrogram()

    for wav_path in tqdm(wav_paths):
        audio, sr = torchaudio.load(wav_path)
        # Mono: take channel 0 and clip to the valid waveform range.
        audio = torch.clamp(audio[0], -1.0, 1.0)
        with torch.no_grad():
            # NOTE(review): 0.434294 ≈ 1/ln(10), but here it multiplies the
            # raw (linear) mel, not a log-mel — confirm this is intended.
            spectrogram = mel_spec_transform(audio.unsqueeze(0).cuda()) * 0.434294