diff --git a/.gitignore b/.gitignore index 4fdef3e1745..05bf1338fed 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ test/.coverage test/data/legacy_serialized.pt *~ .idea +data/debabble diff --git a/models/networks.py b/models/networks.py index 23b075e6e9b..ded80bec8ad 100644 --- a/models/networks.py +++ b/models/networks.py @@ -116,7 +116,7 @@ def init_net(net, init_type='normal', init_gain=0.02, gpu_ids=[]): return net -def define_G(input_nc, output_nc, ngf, netG, norm='batch', use_dropout=False, init_type='normal', init_gain=0.02, gpu_ids=[]): +def define_G(input_nc, output_nc, ngf, netG, norm='batch', use_dropout=False, init_type='normal', init_gain=0.02, gpu_ids=[], kws=None, paddings=None): """Create a generator Parameters: @@ -151,15 +151,15 @@ def define_G(input_nc, output_nc, ngf, netG, norm='batch', use_dropout=False, in elif netG == 'resnet_6blocks': net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=6) elif netG == 'unet_128': - net = UnetGenerator(input_nc, output_nc, 7, ngf, norm_layer=norm_layer, use_dropout=use_dropout) + net = UnetGenerator(input_nc, output_nc, 7, ngf, norm_layer=norm_layer, use_dropout=use_dropout, kws=kws, paddings=paddings) elif netG == 'unet_256': - net = UnetGenerator(input_nc, output_nc, 8, ngf, norm_layer=norm_layer, use_dropout=use_dropout) + net = UnetGenerator(input_nc, output_nc, 8, ngf, norm_layer=norm_layer, use_dropout=use_dropout, kws=kws, paddings=paddings) else: raise NotImplementedError('Generator model name [%s] is not recognized' % netG) return init_net(net, init_type, init_gain, gpu_ids) -def define_D(input_nc, ndf, netD, n_layers_D=3, norm='batch', init_type='normal', init_gain=0.02, gpu_ids=[]): +def define_D(input_nc, ndf, netD, n_layers_D=3, norm='batch', init_type='normal', init_gain=0.02, gpu_ids=[], kws=None, paddings=None): """Create a discriminator Parameters: @@ -193,9 +193,9 @@ def define_D(input_nc, ndf, netD, n_layers_D=3, 
norm='batch', init_type='normal' norm_layer = get_norm_layer(norm_type=norm) if netD == 'basic': # default PatchGAN classifier - net = NLayerDiscriminator(input_nc, ndf, n_layers=3, norm_layer=norm_layer) + net = NLayerDiscriminator(input_nc, ndf, n_layers=3, norm_layer=norm_layer, kws=kws, paddings=paddings) elif netD == 'n_layers': # more options - net = NLayerDiscriminator(input_nc, ndf, n_layers_D, norm_layer=norm_layer) + net = NLayerDiscriminator(input_nc, ndf, n_layers_D, norm_layer=norm_layer, kws=kws, paddings=paddings) elif netD == 'pixel': # classify if each pixel is real or fake net = PixelDiscriminator(input_nc, ndf, norm_layer=norm_layer) else: @@ -436,7 +436,7 @@ def forward(self, x): class UnetGenerator(nn.Module): """Create a Unet-based generator""" - def __init__(self, input_nc, output_nc, num_downs, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False): + def __init__(self, input_nc, output_nc, num_downs, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False, kws=None, paddings=None): """Construct a Unet generator Parameters: input_nc (int) -- the number of channels in input images @@ -450,15 +450,24 @@ def __init__(self, input_nc, output_nc, num_downs, ngf=64, norm_layer=nn.BatchNo It is a recursive process. 
""" super(UnetGenerator, self).__init__() + if kws is None: + self.kws = [None for _ in range(num_downs)] + self.paddings = [None for _ in range(num_downs)] + elif paddings is None: + self.kws = kws + self.paddings = [None for _ in range(num_downs)] + else: + self.kws = kws + self.paddings = paddings # construct unet structure - unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=None, norm_layer=norm_layer, innermost=True) # add the innermost layer + unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=None, norm_layer=norm_layer, innermost=True, kws=self.kws[-1], paddings=self.paddings[-1]) # add the innermost layer for i in range(num_downs - 5): # add intermediate layers with ngf * 8 filters - unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer, use_dropout=use_dropout) + unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer, use_dropout=use_dropout, kws=self.kws[-i - 2], paddings=self.paddings[-i - 2]) # gradually reduce the number of filters from ngf * 8 to ngf - unet_block = UnetSkipConnectionBlock(ngf * 4, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer) - unet_block = UnetSkipConnectionBlock(ngf * 2, ngf * 4, input_nc=None, submodule=unet_block, norm_layer=norm_layer) - unet_block = UnetSkipConnectionBlock(ngf, ngf * 2, input_nc=None, submodule=unet_block, norm_layer=norm_layer) - self.model = UnetSkipConnectionBlock(output_nc, ngf, input_nc=input_nc, submodule=unet_block, outermost=True, norm_layer=norm_layer) # add the outermost layer + unet_block = UnetSkipConnectionBlock(ngf * 4, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer, kws=self.kws[-num_downs + 3], paddings=self.paddings[-num_downs + 3]) + unet_block = UnetSkipConnectionBlock(ngf * 2, ngf * 4, input_nc=None, submodule=unet_block, norm_layer=norm_layer, kws=self.kws[-num_downs + 2], 
paddings=self.paddings[-num_downs + 2]) + unet_block = UnetSkipConnectionBlock(ngf, ngf * 2, input_nc=None, submodule=unet_block, norm_layer=norm_layer, kws=self.kws[-num_downs + 1], paddings=self.paddings[-num_downs + 1]) + self.model = UnetSkipConnectionBlock(output_nc, ngf, input_nc=input_nc, submodule=unet_block, outermost=True, norm_layer=norm_layer, kws=self.kws[-num_downs], paddings=self.paddings[-num_downs]) # add the outermost layer def forward(self, input): """Standard forward""" @@ -472,7 +481,7 @@ class UnetSkipConnectionBlock(nn.Module): """ def __init__(self, outer_nc, inner_nc, input_nc=None, - submodule=None, outermost=False, innermost=False, norm_layer=nn.BatchNorm2d, use_dropout=False): + submodule=None, outermost=False, innermost=False, norm_layer=nn.BatchNorm2d, use_dropout=False, kws=None, paddings=None): """Construct a Unet submodule with skip connections. Parameters: @@ -486,6 +495,15 @@ def __init__(self, outer_nc, inner_nc, input_nc=None, use_dropout (bool) -- if use dropout layers. 
""" super(UnetSkipConnectionBlock, self).__init__() + if kws is None: + self.kws = 4 + self.paddings = 1 + elif paddings is None: + self.kws = kws + self.paddings = 1 + else: + self.kws = kws + self.paddings = paddings self.outermost = outermost if type(norm_layer) == functools.partial: use_bias = norm_layer.func == nn.InstanceNorm2d @@ -493,8 +511,8 @@ def __init__(self, outer_nc, inner_nc, input_nc=None, use_bias = norm_layer == nn.InstanceNorm2d if input_nc is None: input_nc = outer_nc - downconv = nn.Conv2d(input_nc, inner_nc, kernel_size=4, - stride=2, padding=1, bias=use_bias) + downconv = nn.Conv2d(input_nc, inner_nc, kernel_size=self.kws, + stride=2, padding=self.paddings, bias=use_bias) downrelu = nn.LeakyReLU(0.2, True) downnorm = norm_layer(inner_nc) uprelu = nn.ReLU(True) @@ -502,22 +520,22 @@ def __init__(self, outer_nc, inner_nc, input_nc=None, if outermost: upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc, - kernel_size=4, stride=2, - padding=1) + kernel_size=self.kws, stride=2, + padding=self.paddings) down = [downconv] up = [uprelu, upconv, nn.Tanh()] model = down + [submodule] + up elif innermost: upconv = nn.ConvTranspose2d(inner_nc, outer_nc, - kernel_size=4, stride=2, - padding=1, bias=use_bias) + kernel_size=self.kws, stride=2, + padding=self.paddings, bias=use_bias) down = [downrelu, downconv] up = [uprelu, upconv, upnorm] model = down + up else: upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc, - kernel_size=4, stride=2, - padding=1, bias=use_bias) + kernel_size=self.kws, stride=2, + padding=self.paddings, bias=use_bias) down = [downrelu, downconv, downnorm] up = [uprelu, upconv, upnorm] @@ -538,7 +556,7 @@ def forward(self, x): class NLayerDiscriminator(nn.Module): """Defines a PatchGAN discriminator""" - def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d): + def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d, kws=None, paddings=None): """Construct a PatchGAN discriminator Parameters: @@ 
-548,21 +566,28 @@ def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d): norm_layer -- normalization layer """ super(NLayerDiscriminator, self).__init__() + if kws is None: + self.kws = [4 for _ in range(n_layers + 2)] + self.paddings = [1 for _ in range(n_layers + 2)] + elif paddings is None: + self.kws = kws + self.paddings = [1 for _ in range(n_layers + 2)] + else: + self.kws = kws + self.paddings = paddings if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters use_bias = norm_layer.func == nn.InstanceNorm2d else: use_bias = norm_layer == nn.InstanceNorm2d - kw = 4 - padw = 1 - sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)] + sequence = [nn.Conv2d(input_nc, ndf, kernel_size=self.kws[0], stride=2, padding=self.paddings[0]), nn.LeakyReLU(0.2, True)] nf_mult = 1 nf_mult_prev = 1 for n in range(1, n_layers): # gradually increase the number of filters nf_mult_prev = nf_mult nf_mult = min(2 ** n, 8) sequence += [ - nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias), + nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=self.kws[n], stride=2, padding=self.paddings[n], bias=use_bias), norm_layer(ndf * nf_mult), nn.LeakyReLU(0.2, True) ] @@ -570,12 +595,12 @@ def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d): nf_mult_prev = nf_mult nf_mult = min(2 ** n_layers, 8) sequence += [ - nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias), + nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=self.kws[n_layers], stride=1, padding=self.paddings[n_layers], bias=use_bias), norm_layer(ndf * nf_mult), nn.LeakyReLU(0.2, True) ] - sequence += [nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map + sequence += [nn.Conv2d(ndf * nf_mult, 1, kernel_size=self.kws[n_layers + 1], stride=1, 
padding=self.paddings[n_layers + 1])] # output 1 channel prediction map self.model = nn.Sequential(*sequence) def forward(self, input): diff --git a/models/pix2pix_model.py b/models/pix2pix_model.py index 939eb887ee3..a5fbd4c36ea 100644 --- a/models/pix2pix_model.py +++ b/models/pix2pix_model.py @@ -54,11 +54,11 @@ def __init__(self, opt): self.model_names = ['G'] # define networks (both generator and discriminator) self.netG = networks.define_G(opt.input_nc, opt.output_nc, opt.ngf, opt.netG, opt.norm, - not opt.no_dropout, opt.init_type, opt.init_gain, self.gpu_ids) + not opt.no_dropout, opt.init_type, opt.init_gain, self.gpu_ids, kws=opt.kwsG, paddings=opt.paddingsG) if self.isTrain: # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc self.netD = networks.define_D(opt.input_nc + opt.output_nc, opt.ndf, opt.netD, - opt.n_layers_D, opt.norm, opt.init_type, opt.init_gain, self.gpu_ids) + opt.n_layers_D, opt.norm, opt.init_type, opt.init_gain, self.gpu_ids, kws=opt.kwsD, paddings=opt.paddingsD) if self.isTrain: # define loss functions diff --git a/options/base_options.py b/options/base_options.py index afb5d0852d1..ba01d635a2b 100644 --- a/options/base_options.py +++ b/options/base_options.py @@ -33,6 +33,10 @@ def initialize(self, parser): parser.add_argument('--netD', type=str, default='basic', help='specify discriminator architecture [basic | n_layers | pixel]. The basic model is a 70x70 PatchGAN. n_layers allows you to specify the layers in the discriminator') parser.add_argument('--netG', type=str, default='resnet_9blocks', help='specify generator architecture [resnet_9blocks | resnet_6blocks | unet_256 | unet_128]') parser.add_argument('--n_layers_D', type=int, default=3, help='only used if netD==n_layers') + parser.add_argument('--kwsD', type=str, default='', help='n_layers + 2 numbers separated by underscores, corresponding to the discriminator kernel widths. 
default empty string will be interpreted as 4_4_..._4. Give a custom input as a string in the form 4_3_4_3_3_3 for example.') + parser.add_argument('--paddingsD', type=str, default='', help='n_layers + 2 numbers separated by underscores, corresponding to the discriminator padding amounts. Default empty string will be interpreted as 1_ ... _1. Give a custom input as a string in the form 2_1_1_1_2_2 for example.') + parser.add_argument('--kwsG', type=str, default='', help='The kernel widths for the generator blocks, starting from outermost in (only works for Unet currently). Must be length 8 for unet256, and length 7 for unet128. Default empty string is interpreted as 4_4_4_4_4_4_4_4 for unet256 and 4_4_4_4_4_4_4 for unet128') + parser.add_argument('--paddingsG', type=str, default='', help='The padding amounts for the generator blocks, starting from outermost in (only works for Unet currently). Must be length 8 for unet256, and length 7 for unet128. Default empty string is interpreted as 1_1_1_1_1_1_1_1 for unet256 and 1_1_1_1_1_1_1 for unet128') parser.add_argument('--norm', type=str, default='instance', help='instance normalization or batch normalization [instance | batch | none]') parser.add_argument('--init_type', type=str, default='normal', help='network initialization [normal | xavier | kaiming | orthogonal]') parser.add_argument('--init_gain', type=float, default=0.02, help='scaling factor for normal, xavier and orthogonal.') @@ -121,7 +125,19 @@ def parse(self): opt.name = opt.name + suffix self.print_options(opt) - + # decode kws and paddings strings + kwsD = opt.kwsD.split('_') + kwsD = None if kwsD == [''] else [int(value) for value in kwsD] + opt.kwsD = kwsD + kwsG = opt.kwsG.split('_') + kwsG = None if kwsG == [''] else [int(value) for value in kwsG] + opt.kwsG = kwsG + paddingsD = opt.paddingsD.split('_') + paddingsD = None if paddingsD == [''] else [int(value) for value in paddingsD] + opt.paddingsD = paddingsD + paddingsG = opt.paddingsG.split('_') + 
"""Build paired mel-spectrogram image tiles for the debabble dataset.

For every clean recording matched by ``<audio_directory>/A/*`` the noisy
counterpart under ``/B/`` is loaded, both signals are converted to normalised
mel-spectrograms, and the spectrograms are chopped into ``n_mels``-frame wide
tiles that are written as PNG files with identical names to
``<img_directory>/A`` and ``<img_directory>/B``.

Run with ``--split`` to additionally move the tiles into ``train`` / ``val``
/ ``test`` sub-directories.
"""
import glob
import os
import random
import re
import sys

import numpy as np

# Drop the ROS Kinetic Python 2.7 package directory from the import path when
# it is present.  NOTE(review): presumably this keeps the ``cv2`` import below
# from resolving to the ROS copy -- confirm.  The membership test avoids the
# ValueError that an unconditional ``remove`` raises on machines without ROS.
_ROS_DIST_PACKAGES = '/opt/ros/kinetic/lib/python2.7/dist-packages'
if _ROS_DIST_PACKAGES in sys.path:
    sys.path.remove(_ROS_DIST_PACKAGES)

noise_gain_str = '2.5'
n_mels = 256
audio_directory = 'data/debabble/audio'
img_directory = f'data/debabble/mel_img{n_mels}_{noise_gain_str}'

_SPEAKER_PATTERN = re.compile(r'speaker_([0-9]+)')


def melpectrogram(y, start_sample=0, n_samples=None, min_dB=-100, n_fft=1024,
                  hop_length=256, n_mels=80, fmin=40, fmax=8000, sr=22050):
    """Return a mel-spectrogram of ``y`` in dB, normalised to ``[0, 1]``.

    Parameters:
        y             -- 1-D audio signal
        start_sample  -- index of the first sample to use
        n_samples     -- number of samples to use (None: everything in ``y``)
        min_dB        -- dB value mapped to 0; 0 dB is mapped to 1
        n_fft         -- FFT size of the STFT
        hop_length    -- hop size of the STFT
        n_mels        -- number of mel bands (rows of the result)
        fmin          -- lowest frequency of the mel filter bank
        fmax          -- accepted for backward compatibility, currently unused
        sr            -- sample rate of ``y``; the default equals the rate
                         ``librosa.load`` resamples to when called without
                         ``sr``

    Returns an array of shape ``(n_mels, n_frames)`` with values in [0, 1].
    """
    # Imported lazily so the path/split helpers stay usable without librosa.
    import librosa

    if n_samples is None:
        n_samples = len(y)
    y = y[start_sample:start_sample + n_samples]
    # Magnitude STFT.
    D = np.abs(librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length))
    # NOTE(review): ``fmax`` has never been forwarded to the filter bank;
    # passing it now would change every generated spectrogram -- confirm the
    # intent before wiring it through.
    S = librosa.feature.melspectrogram(S=D, sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin)
    # Amplitude to dB, floored at 1e-5 (-100 dB) to avoid log(0).
    S = 20 * np.log10(np.maximum(1e-5, S))
    # Map [min_dB, 0] dB linearly onto [0, 1].
    return np.clip((S - min_dB) / -min_dB, 0, 1)


def mkpath(directory):
    """Create ``directory`` (and missing parents); do nothing if it exists."""
    os.makedirs(directory, exist_ok=True)


def progress(iterable):
    """Wrap ``iterable`` in a tqdm progress bar when tqdm is installed."""
    try:
        from tqdm import tqdm
    except ImportError:
        return iterable
    return tqdm(iterable)


def speaker_number(path):
    """Return the integer N of the ``speaker_N`` tag contained in ``path``.

    Raises ValueError when the path carries no such tag.
    """
    match = _SPEAKER_PATTERN.search(path)
    if match is None:
        raise ValueError(f'no speaker_<N> tag in audio path: {path!r}')
    return int(match.group(1))


def noisy_path(clean_path, gain_str):
    """Map a clean ``/A/..._clean.wav`` path to its noisy ``/B/`` twin."""
    return clean_path.replace('/A/', '/B/').replace('_clean.wav', f'_{gain_str}_noise.wav')


def split_name(index, n_test, n_val):
    """Return the split ('test', 'val' or 'train') for a shuffled position.

    Positions below ``n_test`` are test, the remaining positions below
    ``n_val`` are validation, everything else is training.
    """
    if index < n_test:
        return 'test'
    if index < n_val:
        return 'val'
    return 'train'


def make_images(audio_dir=audio_directory, img_dir=img_directory,
                gain_str=noise_gain_str, tile=n_mels):
    """Write clean (A) / noisy (B) spectrogram tiles for every recording.

    Raises ValueError for a recording without a ``speaker_N`` tag and IOError
    when an image cannot be written.
    """
    import cv2
    import librosa

    mkpath(f'{img_dir}/A')
    mkpath(f'{img_dir}/B')
    # Sorted so the running index embedded in the file names is reproducible.
    clean_files = sorted(glob.glob(f'{audio_dir}/A/*'))
    for i, clean_file in enumerate(progress(clean_files)):
        speaker_num = speaker_number(clean_file)
        yA, srA = librosa.load(clean_file)
        yB, srB = librosa.load(noisy_path(clean_file, gain_str))
        melA = melpectrogram(yA, n_mels=tile, sr=srA)
        melB = melpectrogram(yB, n_mels=tile, sr=srB)
        # Only cut tiles that exist in both spectrograms so pairs stay aligned.
        n_frames = min(melA.shape[1], melB.shape[1])
        for j in range(n_frames // tile):
            start = j * tile
            name = f'mel_{gain_str}_{i:06d}_s{speaker_num:03d}_{j:02d}.png'
            for side, mel in (('A', melA), ('B', melB)):
                out_path = f'{img_dir}/{side}/{name}'
                # imwrite reports failure through its return value only.
                if not cv2.imwrite(out_path, mel[:, start:start + tile] * 255.):
                    raise IOError(f'could not write {out_path}')


def split_images(img_dir):
    """Move the tiles in ``img_dir`` into test / val / train sub-directories.

    The A tiles are shuffled; the first 1% go to ``test``, the rest of the
    first 10% to ``val`` and the remainder to ``train``.  Each B tile follows
    its A tile so the pairs stay together.
    """
    tiles = sorted(glob.glob(f'{img_dir}/A/*.png'))
    random.shuffle(tiles)
    n_val = len(tiles) // 10
    n_test = len(tiles) // 100
    for side in ('A', 'B'):
        for split in ('train', 'val', 'test'):
            mkpath(os.path.join(img_dir, side, split))
    for i, tile_path in enumerate(progress(tiles)):
        name = os.path.basename(tile_path)
        split = split_name(i, n_test, n_val)
        for side in ('A', 'B'):
            os.rename(os.path.join(img_dir, side, name),
                      os.path.join(img_dir, side, split, name))


def main(argv=None):
    """Generate the tiles; also split them when ``--split`` is given."""
    args = sys.argv[1:] if argv is None else argv
    make_images()
    if '--split' in args:
        split_images(img_directory)
    print('ran')


if __name__ == '__main__':
    main()