From f38b4a317f2c5f1fd2d40668fe7be48099359d66 Mon Sep 17 00:00:00 2001
From: Krishna Subramani <subramani.krishna97@gmail.com>
Date: Mon, 25 Sep 2023 00:19:41 -0400
Subject: [PATCH] Python code for neural pitch

---
 dnn/torch/neural-pitch/README.md              |  18 +
 dnn/torch/neural-pitch/data_augmentation.py   | 149 ++++
 dnn/torch/neural-pitch/download_demand.sh     |  43 ++
 dnn/torch/neural-pitch/evaluation.py          | 464 ++++++++++++++++++
 dnn/torch/neural-pitch/experiments.py         |  38 ++
 .../export_neuralpitch_weights.py             |  89 ++++
 dnn/torch/neural-pitch/models.py              | 218 ++++
 dnn/torch/neural-pitch/neural_pitch_update.py | 207 ++++
 dnn/torch/neural-pitch/ptdb_process.sh        |  34 ++
 dnn/torch/neural-pitch/training.py            | 162 ++++
 dnn/torch/neural-pitch/utils.py               |  59 +++
 11 files changed, 1481 insertions(+)
 create mode 100644 dnn/torch/neural-pitch/README.md
 create mode 100644 dnn/torch/neural-pitch/data_augmentation.py
 create mode 100644 dnn/torch/neural-pitch/download_demand.sh
 create mode 100644 dnn/torch/neural-pitch/evaluation.py
 create mode 100644 dnn/torch/neural-pitch/experiments.py
 create mode 100644 dnn/torch/neural-pitch/export_neuralpitch_weights.py
 create mode 100644 dnn/torch/neural-pitch/models.py
 create mode 100644 dnn/torch/neural-pitch/neural_pitch_update.py
 create mode 100644 dnn/torch/neural-pitch/ptdb_process.sh
 create mode 100644 dnn/torch/neural-pitch/training.py
 create mode 100644 dnn/torch/neural-pitch/utils.py

diff --git a/dnn/torch/neural-pitch/README.md b/dnn/torch/neural-pitch/README.md
new file mode 100644
index 000000000..6323ead5d
--- /dev/null
+++ b/dnn/torch/neural-pitch/README.md
@@ -0,0 +1,18 @@
+## Neural Pitch Estimation
+
+- Dataset Installation
+    1. Download and unzip the PTDB dataset:
+        wget https://www2.spsc.tugraz.at/databases/PTDB-TUG/SPEECH_DATA_ZIPPED.zip
+        unzip SPEECH_DATA_ZIPPED.zip
+
+    2. Inside the unzipped "SPEECH DATA" directory, run ptdb_process.sh to combine the male/female audio and pitch references
+
+    3. To download and combine the DEMAND noise dataset, simply run download_demand.sh
+
+- LPCNet preparation
+    1. To extract xcorr features, add lpcnet_extractor.c, add the relevant functions to lpcnet_enc.c, add the sources for the headers/C files to Makefile.am, and compile to generate the ./lpcnet_xcorr_extractor object
+
+- Dataset Augmentation and training (check the arguments to each of the following)
+    1. Run data_augmentation.py
+    2. Run training.py on the augmented data
+    3. Run experiments.py
diff --git a/dnn/torch/neural-pitch/data_augmentation.py b/dnn/torch/neural-pitch/data_augmentation.py
new file mode 100644
index 000000000..ee7a3cab6
--- /dev/null
+++ b/dnn/torch/neural-pitch/data_augmentation.py
@@ -0,0 +1,149 @@
+"""
+Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
+1. Read in chunks and compute clean pitch first
+2. Then add in augmentation (Noise/Level/Response)
+    - Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
+    - When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
+3. 
Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input + +Notes: To ensure consistency with the discovered CREPE offset, we do the following +- We pad the input audio to the zero-centered CREPE estimator with 80 zeros +- We pad the input audio to our feature computation with 160 zeros to center them +""" + +import argparse +parser = argparse.ArgumentParser() + +parser.add_argument('data', type=str, help='input raw audio data') +parser.add_argument('output', type=str, help='output directory') +parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)') +parser.add_argument('noise_dataset', type=str, help='Location of the Demand Datset') +parser.add_argument('--flag_xcorr', type=bool, help='Flag to additionally dump xcorr features',choices=[True,False],default = False,required = False) +parser.add_argument('--fraction_input_use', type=float, help='Fraction of input data to consider',default = 0.3,required = False) +parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False) +parser.add_argument('--choice_augment', type=str, help='Choice of noise augmentation, either use additive synthetic noise or add noise from the demand dataset',choices = ['demand','synthetic'],default = "demand",required = False) +parser.add_argument('--fraction_clean', type=float, help='Fraction of data to keep clean (that is not augment with anything)',default = 0.2,required = False) +parser.add_argument('--chunk_size', type=int, help='Number of samples to augment with for each iteration',default = 80000,required = False) +parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False) +parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False) +parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False) + +args = parser.parse_args() + +import os +os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index) + +from utils import stft, random_filter + +import numpy as np +import tqdm +import crepe +import random +import glob +import subprocess + +data_full = np.memmap(args.data, dtype=np.int16,mode = 'r') +data = data_full[:(int)(args.fraction_input_use*data_full.shape[0])] + +# list_features = [] +list_cents = [] +list_confidences = [] + +N = args.N +H = args.H +freq_keep = args.freq_keep +# Minimum/Maximum periods, decided by LPCNet +min_period = 32 +max_period = 256 +f_ref = 16000/max_period +chunk_size = args.chunk_size +num_frames_chunk = chunk_size//H +list_indices_keep = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)]) + +output_IF = np.memmap(args.output + '_iffeat.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,list_indices_keep.shape[0]), mode='w+') +if args.flag_xcorr: + output_xcorr = np.memmap(args.output + '_xcorr.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,257), mode='w+') + +fraction_clean = args.fraction_clean + +noise_dataset = args.noise_dataset + +for i in tqdm.trange((data.shape[0]//chunk_size - 1)//1): + chunk = data[i*chunk_size:(i + 1)*chunk_size]/(2**15 - 1) + + # Clean Pitch/Confidence Estimate + # Padding input to CREPE by 80 samples to ensure it aligns + _, pitch, confidence, _ = crepe.predict(np.concatenate([np.zeros(80),chunk]), 16000, center=True, viterbi=True,verbose=0) + cent = 
1200*np.log2(np.divide(pitch, f_ref, out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8) + + # Filter out of range pitches/confidences + confidence[pitch < 16000/max_period] = 0 + confidence[pitch > 16000/min_period] = 0 + + # Keep fraction of data clean, augment only 1 minus the fraction + if (np.random.rand() > fraction_clean): + # Response, generate controlled/random 2nd order IIR filter and filter chunk + chunk = random_filter(chunk) + + # Level/Gain response {scale by random gain between 1.0e-3 and 10} + # Generate random gain in dB and then convert to scale + g_dB = np.random.uniform(low = -60, high = 20, size = 1) + # g_dB = 0 + g = 10**(g_dB/20) + + # Noise Addition {Add random SNR 2nd order randomly colored noise} + # Generate noise SNR value and add corresponding noise + snr_dB = np.random.uniform(low = -20, high = 30, size = 1) + + if args.choice_augment == 'synthetic': + n = np.random.randn(chunk_size) + else: + list_noisefiles = noise_dataset + '*.wav' + noise_file = random.choice(glob.glob(list_noisefiles)) + n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1) + rand_range = np.random.randint(low = 0, high = (n.shape[0] - 16000*60 - chunk.shape[0])) # 16000 is subtracted because we will use the last 1 minutes of noise for testing + n = n[rand_range:rand_range + chunk.shape[0]] + + # Randomly filter the sampled noise as well + n = random_filter(n) + # generate random prime number between 0,500 and make those samples of noise 0 (to prevent GRU from picking up temporal patterns) + Nprime = random.choice([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541]) + n[chunk_size - Nprime:] = np.zeros(Nprime) + snr_multiplier = np.sqrt((np.sum(np.abs(chunk)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10)) + + chunk = g*(chunk + snr_multiplier*n) + + # Zero pad input audio by 160 to center the frames + spec = stft(x = np.concatenate([np.zeros(160),chunk]), w = 'boxcar', N = N, H = H).T + phase_diff = spec*np.conj(np.roll(spec,1,axis = -1)) + phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8) + feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T + feature = feature[:,list_indices_keep] + + if args.flag_xcorr: + # Dump noisy audio into temp file + data_temp = np.memmap('./temp_augment.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+') + # data_temp[:chunk.shape[0]] = (chunk/(np.max(np.abs(chunk)))*(2**15 - 1)).astype(np.int16) + data_temp[:chunk.shape[0]] = ((chunk)*(2**15 - 1)).astype(np.int16) + + subprocess.run([args.path_lpcnet_extractor, './temp_augment.raw', './temp_augment_xcorr.f32']) + feature_xcorr = np.flip(np.fromfile('./temp_augment_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1) + ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1) + feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1) + + os.remove('./temp_augment.raw') + os.remove('./temp_augment_xcorr.f32') + num_frames = min(cent.shape[0],feature.shape[0],feature_xcorr.shape[0],num_frames_chunk) + feature = feature[:num_frames,:] + cent = cent[:num_frames] + confidence = 
confidence[:num_frames] + feature_xcorr = feature_xcorr[:num_frames] + output_IF[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature + output_xcorr[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature_xcorr + list_cents.append(cent) + list_confidences.append(confidence) + +list_cents = np.hstack(list_cents) +list_confidences = np.hstack(list_confidences) + +np.save(args.output + '_pitches',np.vstack([list_cents,list_confidences])) diff --git a/dnn/torch/neural-pitch/download_demand.sh b/dnn/torch/neural-pitch/download_demand.sh new file mode 100644 index 000000000..0cff06af4 --- /dev/null +++ b/dnn/torch/neural-pitch/download_demand.sh @@ -0,0 +1,43 @@ +wget https://zenodo.org/record/1227121/files/DKITCHEN_16k.zip + +wget https://zenodo.org/record/1227121/files/DLIVING_16k.zip + +wget https://zenodo.org/record/1227121/files/DWASHING_16k.zip + +wget https://zenodo.org/record/1227121/files/NFIELD_16k.zip + +wget https://zenodo.org/record/1227121/files/NPARK_16k.zip + +wget https://zenodo.org/record/1227121/files/NRIVER_16k.zip + +wget https://zenodo.org/record/1227121/files/OHALLWAY_16k.zip + +wget https://zenodo.org/record/1227121/files/OMEETING_16k.zip + +wget https://zenodo.org/record/1227121/files/OOFFICE_16k.zip + +wget https://zenodo.org/record/1227121/files/PCAFETER_16k.zip + +wget https://zenodo.org/record/1227121/files/PRESTO_16k.zip + +wget https://zenodo.org/record/1227121/files/PSTATION_16k.zip + +wget https://zenodo.org/record/1227121/files/TMETRO_16k.zip + +wget https://zenodo.org/record/1227121/files/TCAR_16k.zip + +wget https://zenodo.org/record/1227121/files/TBUS_16k.zip + +wget https://zenodo.org/record/1227121/files/STRAFFIC_16k.zip + +wget https://zenodo.org/record/1227121/files/SPSQUARE_16k.zip + +unzip '*.zip' + +mkdir -p ./combined_demand_channels/ +for file in */*.wav; do +parentdir="$(dirname "$file")" +echo $parentdir +fname="$(basename "$file")" +cp $file ./combined_demand_channels/$parentdir+$fname +done diff --git a/dnn/torch/neural-pitch/evaluation.py b/dnn/torch/neural-pitch/evaluation.py new file mode 100644 index 000000000..0369cafaf --- /dev/null +++ b/dnn/torch/neural-pitch/evaluation.py @@ -0,0 +1,464 @@ +""" +Evaluation script to compute the Raw Pitch Accuracy +Procedure: + - Look at all voiced frames in file + - Compute number of pitches in those frames that lie within a 50 cent threshold + RPA = (Total number of pitches within threshold summed across all files)/(Total number of voiced frames summed accross all files) +""" + +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +from prettytable import PrettyTable +import numpy as np +import glob +import random +import tqdm +import torch +import librosa +import json +from utils import stft, random_filter, feature_xform +import subprocess +import crepe + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +def rca(reference,input,voicing,thresh = 25): + idx_voiced = np.where(voicing != 0)[0] + acc = np.where(np.abs(reference - input)[idx_voiced] < thresh)[0] + return acc.shape[0] + +def sweep_rca(reference,input,voicing,thresh = 25,ind_arr = np.arange(-10,10)): + l = [] + for i in ind_arr: + l.append(rca(reference,np.roll(input,i),voicing,thresh)) + l = np.array(l) + + return np.max(l) + +def rpa(model,device = 'cpu',data_format = 'if'): + list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw') + dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/' + # random_shuffle = list(np.random.permutation(len(list_files))) + 
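# Shuffle the PTDB file list and evaluate on a random subset of at most 1000 utterances +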
random.shuffle(list_files) + list_files = list_files[:1000] + + # C_lp = 0 + # C_lp_m = 0 + # C_lp_f = 0 + # list_rca_model_lp = [] + # list_rca_male_lp = [] + # list_rca_female_lp = [] + + # C_hp = 0 + # C_hp_m = 0 + # C_hp_f = 0 + # list_rca_model_hp = [] + # list_rca_male_hp = [] + # list_rca_female_hp = [] + + C_all = 0 + C_all_m = 0 + C_all_f = 0 + list_rca_model_all = [] + list_rca_male_all = [] + list_rca_female_all = [] + + thresh = 50 + N = 320 + H = 160 + freq_keep = 30 + + for idx in tqdm.trange(len(list_files)): + audio_file = list_files[idx] + file_name = os.path.basename(list_files[idx])[:-4] + + audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1) + offset = 432 + audio = audio[offset:] + rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160)) + + spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T + phase_diff = spec*np.conj(np.roll(spec,1,axis = -1)) + phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8) + idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)]) + feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T + feature_if = feature[:,idx_save] + + data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+') + data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16) + + subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32']) + feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1) + ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1) + feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1) + # feature_xcorr = feature_xform(feature_xcorr) + + os.remove('./temp.raw') + os.remove('./temp_xcorr.f32') + + if data_format == 'if': + feature = feature_if + elif data_format == 'xcorr': + feature = feature_xcorr + else: + indmin = min(feature_if.shape[0],feature_xcorr.shape[0]) + feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1) + + + pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0" + pitch = np.loadtxt(pitch_file_name)[:,0] + voicing = np.loadtxt(pitch_file_name)[:,1] + indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0]) + pitch = pitch[:indmin] + voicing = voicing[:indmin] + rmse = rmse[:indmin] + voicing = voicing*(rmse > 0.05*np.max(rmse)) + if "mic_F" in audio_file: + idx_correct = np.where(pitch < 125) + voicing[idx_correct] = 0 + + cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int') + + # if (model == 'penn'): + # model_frequency, _ = penn.from_audio( + # torch.from_numpy(audio).unsqueeze(0).float(), + # 16000, + # hopsize=0.01, + # fmin=(16000.0/256), + # fmax=500, + # checkpoint=penn.DEFAULT_CHECKPOINT, + # batch_size=32, + # pad=True, + # interp_unvoiced_at=0.065, + # gpu=0) + # model_frequency = model_frequency.cpu().detach().squeeze().numpy() + # model_cents = 1200*np.log2(model_frequency/(16000/256)) + + # elif (model == 'crepe'): + # _, model_frequency, _, _ = crepe.predict(audio, 16000, viterbi=vflag,center=True,verbose=0) + # lpcnet_file_name = '/home/ubuntu/Code/Datasets/SPEECH_DATA/lpcnet_f0_16k_residual/' + file_name + '_f0.f32' + # period_lpcnet = np.fromfile(lpcnet_file_name, dtype='float32') + # model_frequency = 16000/(period_lpcnet + 1.0e-6) + # 
model_cents = 1200*np.log2(model_frequency/(16000/256)) + # else: + model_cents = model(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device)) + model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy() + # model_cents = np.roll(model_cents,-1*3) + + num_frames = min(cent.shape[0],model_cents.shape[0]) + pitch = pitch[:num_frames] + cent = cent[:num_frames] + voicing = voicing[:num_frames] + model_cents = model_cents[:num_frames] + + voicing_all = np.copy(voicing) + # Forcefully make regions where pitch is <65 or greater than 500 unvoiced for relevant accurate pitch comparisons for our model + force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True) + voicing_all[force_out_of_pitch] = 0 + C_all = C_all + np.where(voicing_all != 0)[0].shape[0] + + # list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0])) + list_rca_model_all.append(rca(cent,model_cents,voicing_all,thresh)) + # list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents)))) + + if "mic_M" in audio_file: + # list_rca_male_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0])) + list_rca_male_all.append(rca(cent,model_cents,voicing_all,thresh)) + C_all_m = C_all_m + np.where(voicing_all != 0)[0].shape[0] + else: + # list_rca_female_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0])) + list_rca_female_all.append(rca(cent,model_cents,voicing_all,thresh)) + C_all_f = C_all_f + np.where(voicing_all != 0)[0].shape[0] + + """ + # Low pitch estimation + voicing_lp = np.copy(voicing) + force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 125)==True) + voicing_lp[force_out_of_pitch] = 0 + C_lp = C_lp + np.where(voicing_lp != 0)[0].shape[0] + + # list_rca_model_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0])) + list_rca_model_lp.append(rca(cent,model_cents,voicing_lp,thresh)) + + if "mic_M" in audio_file: + # list_rca_male_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0])) + list_rca_male_lp.append(rca(cent,model_cents,voicing_lp,thresh)) + C_lp_m = C_lp_m + np.where(voicing_lp != 0)[0].shape[0] + else: + # list_rca_female_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0])) + list_rca_female_lp.append(rca(cent,model_cents,voicing_lp,thresh)) + C_lp_f = C_lp_f + np.where(voicing_lp != 0)[0].shape[0] + + # High pitch estimation + voicing_hp = np.copy(voicing) + force_out_of_pitch = np.where(np.logical_or(pitch < 125,pitch > 500)==True) + voicing_hp[force_out_of_pitch] = 0 + C_hp = C_hp + np.where(voicing_hp != 0)[0].shape[0] + + # list_rca_model_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0])) + list_rca_model_hp.append(rca(cent,model_cents,voicing_hp,thresh)) + + if "mic_M" in audio_file: + # list_rca_male_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0])) + list_rca_male_hp.append(rca(cent,model_cents,voicing_hp,thresh)) + C_hp_m = C_hp_m + np.where(voicing_hp != 0)[0].shape[0] + else: + # list_rca_female_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0])) + list_rca_female_hp.append(rca(cent,model_cents,voicing_hp,thresh)) + C_hp_f = C_hp_f + np.where(voicing_hp != 0)[0].shape[0] + # list_rca_model.append(acc_model) + # list_rca_crepe.append(acc_crepe) + # list_rca_lpcnet.append(acc_lpcnet) + # list_rca_penn.append(acc_penn) + """ + + # list_rca_crepe = np.array(list_rca_crepe) + # list_rca_model_lp = np.array(list_rca_model_lp) + # list_rca_male_lp = np.array(list_rca_male_lp) + # list_rca_female_lp = np.array(list_rca_female_lp) + + # list_rca_model_hp 
= np.array(list_rca_model_hp) + # list_rca_male_hp = np.array(list_rca_male_hp) + # list_rca_female_hp = np.array(list_rca_female_hp) + + list_rca_model_all = np.array(list_rca_model_all) + list_rca_male_all = np.array(list_rca_male_all) + list_rca_female_all = np.array(list_rca_female_all) + # list_rca_lpcnet = np.array(list_rca_lpcnet) + # list_rca_penn = np.array(list_rca_penn) + + x = PrettyTable() + + x.field_names = ["Experiment", "Mean RPA"] + x.add_row(["Both all pitches", np.sum(list_rca_model_all)/C_all]) + # x.add_row(["Both low pitches", np.sum(list_rca_model_lp)/C_lp]) + # x.add_row(["Both high pitches", np.sum(list_rca_model_hp)/C_hp]) + + x.add_row(["Male all pitches", np.sum(list_rca_male_all)/C_all_m]) + # x.add_row(["Male low pitches", np.sum(list_rca_male_lp)/C_lp_m]) + # x.add_row(["Male high pitches", np.sum(list_rca_male_hp)/C_hp_m]) + + x.add_row(["Female all pitches", np.sum(list_rca_female_all)/C_all_f]) + # x.add_row(["Female low pitches", np.sum(list_rca_female_lp)/C_lp_f]) + # x.add_row(["Female high pitches", np.sum(list_rca_female_hp)/C_hp_f]) + + print(x) + + return None + +def cycle_eval(list_files_pth, noise_type = 'synthetic', noise_dataset = None, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = None,fraction = 0.1,thresh = 50): + """ + Cycle through SNR evaluation for list of .pth files + """ + # list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw') + # dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/' + # random_shuffle = list(np.random.permutation(len(list_files))) + list_files = glob.glob(ptdb_dataset_path + 'combined_mic_16k/*.raw') + dir_f0 = ptdb_dataset_path + 'combined_reference_f0/' + random.shuffle(list_files) + list_files = list_files[:(int)(fraction*len(list_files))] + + # list_nfiles = ['DKITCHEN','NFIELD','OHALLWAY','PCAFETER','SPSQUARE','TCAR','DLIVING','NPARK','OMEETING','PRESTO','STRAFFIC','TMETRO','DWASHING','NRIVER','OOFFICE','PSTATION','TBUS'] + + dict_models = {} + list_snr.append(np.inf) + # thresh = 50 + + for f in list_files_pth: + if (f!='crepe') and (f!='lpcnet'): + fname = os.path.basename(f).split('_')[0] + '_' + os.path.basename(f).split('_')[-1][:-4] + config_path = os.path.dirname(f) + '/' + os.path.basename(f).split('_')[0] + '_' + 'config_' + os.path.basename(f).split('_')[-1][:-4] + '.json' + with open(config_path) as json_file: + dict_params = json.load(json_file) + + if dict_params['data_format'] == 'if': + from models import large_if_ccode as model + pitch_nn = model(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim']).to(device) + elif dict_params['data_format'] == 'xcorr': + from models import large_xcorr as model + pitch_nn = model(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device) + else: + from models import large_joint as model + pitch_nn = model(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device) + + pitch_nn.load_state_dict(torch.load(f)) + + N = dict_params['window_size'] + H = dict_params['hop_factor'] + freq_keep = dict_params['freq_keep'] + + list_mean = [] + list_std = [] + for snr_dB in list_snr: + C_all = 0 + C_correct = 0 + for idx in tqdm.trange(len(list_files)): + audio_file = list_files[idx] + file_name = os.path.basename(list_files[idx])[:-4] + + audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1) + offset = 432 + audio = audio[offset:] + rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 
N,hop_length = H)) + + if noise_type != 'synthetic': + list_noisefiles = noise_dataset + '*.wav' + noise_file = random.choice(glob.glob(list_noisefiles)) + n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1) + rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing + n = n[rand_range:rand_range + audio.shape[0]] + else: + n = np.random.randn(audio.shape[0]) + n = random_filter(n) + + snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10)) + audio = audio + snr_multiplier*n + + spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T + phase_diff = spec*np.conj(np.roll(spec,1,axis = -1)) + phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8) + idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)]) + feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T + feature_if = feature[:,idx_save] + + data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+') + # data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16) + data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16) + + subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32']) + feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1) + ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1) + feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1) + + os.remove('./temp.raw') + os.remove('./temp_xcorr.f32') + + if dict_params['data_format'] == 'if': + feature = feature_if + elif dict_params['data_format'] == 'xcorr': + feature = feature_xcorr + else: + indmin = min(feature_if.shape[0],feature_xcorr.shape[0]) + feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1) + + pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0" + pitch = np.loadtxt(pitch_file_name)[:,0] + voicing = np.loadtxt(pitch_file_name)[:,1] + indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0]) + pitch = pitch[:indmin] + voicing = voicing[:indmin] + rmse = rmse[:indmin] + voicing = voicing*(rmse > 0.05*np.max(rmse)) + if "mic_F" in audio_file: + idx_correct = np.where(pitch < 125) + voicing[idx_correct] = 0 + + cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int') + + # if os.path.basename(f) == 'crepe': + # elif (model == 'crepe'): + # _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0) + # model_cents = 1200*np.log2(model_frequency/(16000/256)) + # else: + # else: + model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device)) + model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy() + # model_cents = np.roll(model_cents,-1*3) + + num_frames = min(cent.shape[0],model_cents.shape[0]) + pitch = pitch[:num_frames] + cent = cent[:num_frames] + voicing = voicing[:num_frames] + model_cents = model_cents[:num_frames] + + voicing_all = np.copy(voicing) + # Forcefully make regions where pitch is <65 or greater than 500 unvoiced for relevant accurate pitch comparisons for our model + force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True) + voicing_all[force_out_of_pitch] = 0 + 
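# Accumulate the total number of voiced frames, which forms the RPA denominator +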
C_all = C_all + np.where(voicing_all != 0)[0].shape[0] + + # list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0])) + C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh) + # list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents)))) + list_mean.append(C_correct/C_all) + else: + fname = f + list_mean = [] + list_std = [] + for snr_dB in list_snr: + C_all = 0 + C_correct = 0 + for idx in tqdm.trange(len(list_files)): + audio_file = list_files[idx] + file_name = os.path.basename(list_files[idx])[:-4] + + audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1) + offset = 432 + audio = audio[offset:] + rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160)) + + if noise_type != 'synthetic': + list_noisefiles = noise_dataset + '*.wav' + noise_file = random.choice(glob.glob(list_noisefiles)) + n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1) + rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing + n = n[rand_range:rand_range + audio.shape[0]] + else: + n = np.random.randn(audio.shape[0]) + n = random_filter(n) + + snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10)) + audio = audio + snr_multiplier*n + + if (f == 'crepe'): + _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0) + model_cents = 1200*np.log2(model_frequency/(16000/256) + 1.0e-8) + else: + data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+') + # data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16) + data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16) + + subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32', './temp_period.f32']) + feature_xcorr = np.fromfile('./temp_period.f32', dtype='float32') + model_cents = 1200*np.log2((256/feature_xcorr + 1.0e-8) + 1.0e-8) + + os.remove('./temp.raw') + os.remove('./temp_xcorr.f32') + os.remove('./temp_period.f32') + + + pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0" + pitch = np.loadtxt(pitch_file_name)[:,0] + voicing = np.loadtxt(pitch_file_name)[:,1] + indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0]) + pitch = pitch[:indmin] + voicing = voicing[:indmin] + rmse = rmse[:indmin] + voicing = voicing*(rmse > 0.05*np.max(rmse)) + if "mic_F" in audio_file: + idx_correct = np.where(pitch < 125) + voicing[idx_correct] = 0 + + cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int') + num_frames = min(cent.shape[0],model_cents.shape[0]) + pitch = pitch[:num_frames] + cent = cent[:num_frames] + voicing = voicing[:num_frames] + model_cents = model_cents[:num_frames] + + voicing_all = np.copy(voicing) + # Forcefully make regions where pitch is <65 or greater than 500 unvoiced for relevant accurate pitch comparisons for our model + force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True) + voicing_all[force_out_of_pitch] = 0 + C_all = C_all + np.where(voicing_all != 0)[0].shape[0] + + # list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0])) + C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh) + # list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents)))) + list_mean.append(C_correct/C_all) + dict_models[fname] = {} + 
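# Store per-SNR raw pitch accuracy for this model; the final entry corresponds to the clean (infinite SNR) condition +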
dict_models[fname]['list_SNR'] = list_mean[:-1] + dict_models[fname]['inf'] = list_mean[-1] + + return dict_models diff --git a/dnn/torch/neural-pitch/experiments.py b/dnn/torch/neural-pitch/experiments.py new file mode 100644 index 000000000..bc8ea7e3c --- /dev/null +++ b/dnn/torch/neural-pitch/experiments.py @@ -0,0 +1,38 @@ +""" +Running the experiments; + 1. RCA vs SNR for our models, CREPE, LPCNet +""" + +import argparse +parser = argparse.ArgumentParser() + +parser.add_argument('ptdb_root', type=str, help='Root Directory for PTDB generated by running ptdb_process.sh ') +parser.add_argument('output', type=str, help='Output dump file name') +parser.add_argument('method', type=str, help='Output Directory to save experiment dumps',choices=['model','lpcnet','crepe']) +parser.add_argument('--noise_dataset', type=str, help='Location of the Demand Datset',default = './',required=False) +parser.add_argument('--noise_type', type=str, help='Type of additive noise',default = 'synthetic',choices=['synthetic','demand'],required=False) +parser.add_argument('--pth_file', type=str, help='.pth file to analyze',default = './',required = False) +parser.add_argument('--fraction_files_analyze', type=float, help='Fraction of PTDB dataset to test on',default = 1,required = False) +parser.add_argument('--threshold_rca', type=float, help='Cent threshold when computing RCA',default = 50,required = False) +parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False) + +args = parser.parse_args() + +import os +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index) + +import json +from evaluation import cycle_eval + +if args.method == 'model': + dict_store = cycle_eval([args.pth_file], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca) +else: + dict_store = cycle_eval([args.method], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca) + +dict_store["method"] = args.method +if args.method == 'model': + dict_store['pth'] = args.pth_file + +with open(args.output, 'w') as fp: + json.dump(dict_store, fp) diff --git a/dnn/torch/neural-pitch/export_neuralpitch_weights.py b/dnn/torch/neural-pitch/export_neuralpitch_weights.py new file mode 100644 index 000000000..be3742813 --- /dev/null +++ b/dnn/torch/neural-pitch/export_neuralpitch_weights.py @@ -0,0 +1,89 @@ +""" +/* Copyright (c) 2022 Amazon + Written by Jan Buethe */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +""" + +import os +import argparse +import sys + +sys.path.append(os.path.join(os.path.dirname(__file__), '../weight-exchange')) + + +parser = argparse.ArgumentParser() + +parser.add_argument('checkpoint', type=str, help='rdovae model checkpoint') +parser.add_argument('output_dir', type=str, help='output folder') + +args = parser.parse_args() + +import torch +import numpy as np + +from models import large_if_ccode +from wexchange.torch import dump_torch_weights +from wexchange.c_export import CWriter, print_vector + +def c_export(args, model): + + message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}" + + enc_writer = CWriter(os.path.join(args.output_dir, "neural_pitch_data"), message=message, model_struct_name='nnpitch') + enc_writer.header.write( +f""" +#include "opus_types.h" +""" + ) + + + # encoder + encoder_dense_layers = [ + ('initial' , 'initial', 'TANH'), + ('upsample' , 'upsample', 'TANH') + ] + + for name, export_name, _ in encoder_dense_layers: + layer = model.get_submodule(name) + dump_torch_weights(enc_writer, layer, name=export_name, verbose=True) + + + encoder_gru_layers = [ + ('gru' , 'gru', 'TANH'), + ] + + enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, verbose=True, input_sparse=False, quantize=False) + for name, export_name, _ in encoder_gru_layers]) + + del enc_writer + + +if __name__ == "__main__": + + os.makedirs(args.output_dir, exist_ok=True) + model = large_if_ccode() + model.load_state_dict(torch.load(args.checkpoint,map_location='cpu')) + c_export(args, model) diff --git a/dnn/torch/neural-pitch/models.py b/dnn/torch/neural-pitch/models.py new file mode 100644 index 000000000..426f53ce9 --- /dev/null +++ b/dnn/torch/neural-pitch/models.py @@ -0,0 +1,218 @@ +""" +Pitch Estimation Models and dataloaders + - Classification Based (Input features, output logits) +""" + +import torch +import numpy as np + +class large_if_ccode(torch.nn.Module): + + def __init__(self,input_dim = 90,gru_dim = 64,output_dim = 192): + super(large_if_ccode,self).__init__() + + self.activation = torch.nn.Tanh() + self.initial = torch.nn.Linear(input_dim,gru_dim) + self.hidden = torch.nn.Linear(gru_dim,gru_dim) + self.gru = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,batch_first = True) + self.upsample = torch.nn.Linear(gru_dim,output_dim) + + def forward(self, x): + + x = self.initial(x) + x = self.activation(x) + x = self.hidden(x) + x = self.activation(x) + x,_ = self.gru(x) + x = self.upsample(x) + x = self.activation(x) + x = x.permute(0,2,1) + + return x + +class large_xcorr(torch.nn.Module): + + def __init__(self,input_dim = 90,gru_dim = 64,output_dim = 192): + super(large_xcorr,self).__init__() + + self.activation = torch.nn.Tanh() + + self.conv = torch.nn.Sequential( + torch.nn.ZeroPad2d((2,0,1,1)), + torch.nn.Conv2d(1, 8, 3, bias = True), + self.activation, + torch.nn.ZeroPad2d((2,0,1,1)), + torch.nn.Conv2d(8, 8, 3, bias = True), + self.activation, + 
torch.nn.ZeroPad2d((2,0,1,1)), + torch.nn.Conv2d(8, 1, 3, bias = True), + self.activation, + ) + + # self.conv = torch.nn.Sequential( + # torch.nn.ConstantPad1d((2,0),0), + # torch.nn.Conv1d(64,10,3), + # self.activation, + # torch.nn.ConstantPad1d((2,0),0), + # torch.nn.Conv1d(10,64,3), + # self.activation, + # ) + + self.downsample = torch.nn.Sequential( + torch.nn.Linear(input_dim,gru_dim), + self.activation + ) + self.GRU = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,num_layers = 1,batch_first = True) + self.upsample = torch.nn.Sequential( + torch.nn.Linear(gru_dim,output_dim), + self.activation + ) + + def forward(self, x): + # x = x[:,:,:257].unsqueeze(-1) + x = self.conv(x.unsqueeze(-1).permute(0,3,2,1)).squeeze(1) + # print(x.shape) + # x = self.conv(x.permute(0,3,2,1)).squeeze(1) + x,_ = self.GRU(self.downsample(x.permute(0,2,1))) + x = self.upsample(x).permute(0,2,1) + + # x = self.downsample(x) + # x = self.activation(x) + # x = self.conv(x.permute(0,2,1)).permute(0,2,1) + # x,_ = self.GRU(x) + # x = self.upsample(x).permute(0,2,1) + return x + +class large_joint(torch.nn.Module): + """ + Joint IF-xcorr + 1D CNN on IF, merge with xcorr, 2D CNN on merged + GRU + """ + + def __init__(self,input_IF_dim = 90,input_xcorr_dim = 257,gru_dim = 64,output_dim = 192): + super(large_joint,self).__init__() + + self.activation = torch.nn.Tanh() + + self.if_upsample = torch.nn.Sequential( + torch.nn.Linear(input_IF_dim,64), + self.activation, + torch.nn.Linear(64,64), + self.activation, + ) + + # self.if_upsample = torch.nn.Sequential( + # torch.nn.ConstantPad1d((2,0),0), + # torch.nn.Conv1d(90,10,3), + # self.activation, + # torch.nn.ConstantPad1d((2,0),0), + # torch.nn.Conv1d(10,257,3), + # self.activation, + # ) + + self.conv = torch.nn.Sequential( + torch.nn.ZeroPad2d((2,0,1,1)), + torch.nn.Conv2d(1, 8, 3, bias = True), + self.activation, + torch.nn.ZeroPad2d((2,0,1,1)), + torch.nn.Conv2d(8, 8, 3, bias = True), + self.activation, + torch.nn.ZeroPad2d((2,0,1,1)), + torch.nn.Conv2d(8, 1, 3, bias = True), + self.activation, + ) + + # self.conv = torch.nn.Sequential( + # torch.nn.ConstantPad1d((2,0),0), + # torch.nn.Conv1d(257,10,3), + # self.activation, + # torch.nn.ConstantPad1d((2,0),0), + # torch.nn.Conv1d(10,64,3), + # self.activation, + # ) + + self.downsample = torch.nn.Sequential( + torch.nn.Linear(64 + input_xcorr_dim,gru_dim), + self.activation + ) + self.GRU = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,num_layers = 1,batch_first = True) + self.upsample = torch.nn.Sequential( + torch.nn.Linear(gru_dim,output_dim), + self.activation + ) + + def forward(self, x): + xcorr_feat = x[:,:,:257] + if_feat = x[:,:,257:] + # x = torch.cat([xcorr_feat.unsqueeze(-1),self.if_upsample(if_feat).unsqueeze(-1)],axis = -1) + xcorr_feat = self.conv(xcorr_feat.unsqueeze(-1).permute(0,3,2,1)).squeeze(1).permute(0,2,1) + if_feat = self.if_upsample(if_feat) + x = torch.cat([xcorr_feat,if_feat],axis = - 1) + # x = self.conv(x.permute(0,3,2,1)).squeeze(1) + x,_ = self.GRU(self.downsample(x)) + x = self.upsample(x).permute(0,2,1) + + return x + + +# Dataloaders +class loader(torch.utils.data.Dataset): + def __init__(self, features_if, file_pitch,confidence_threshold = 0.4,dimension_if = 30,context = 100): + self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1,3*dimension_if) + + # Resolution of 20 cents + self.cents = np.rint(np.load(file_pitch)[0,:]/20) + self.cents = np.clip(self.cents,0,179) + self.confidence = np.load(file_pitch)[1,:] + + # Filter confidence for 
CREPE + self.confidence[self.confidence < confidence_threshold] = 0 + self.context = context + # Clip both to same size + size_common = min(self.if_feat.shape[0],self.cents.shape[0]) + self.if_feat = self.if_feat[:size_common,:] + self.cents = self.cents[:size_common] + self.confidence = self.confidence[:size_common] + + frame_max = self.if_feat.shape[0]//context + self.if_feat = np.reshape(self.if_feat[:frame_max*context,:],(frame_max,context,3*dimension_if)) + self.cents = np.reshape(self.cents[:frame_max*context],(frame_max,context)) + self.confidence = np.reshape(self.confidence[:frame_max*context],(frame_max,context)) + + def __len__(self): + return self.if_feat.shape[0] + + def __getitem__(self, index): + return torch.from_numpy(self.if_feat[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index]) + +class loader_joint(torch.utils.data.Dataset): + def __init__(self, features_if, file_pitch, features_xcorr,confidence_threshold = 0.4,context = 100, choice_data = 'both'): + self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1,90) + self.xcorr = np.memmap(features_xcorr, dtype=np.float32).reshape(-1,257) + self.cents = np.rint(np.load(file_pitch)[0,:]/20) + self.cents = np.clip(self.cents,0,179) + self.confidence = np.load(file_pitch)[1,:] + # Filter confidence for CREPE + self.confidence[self.confidence < confidence_threshold] = 0 + self.context = context + + self.choice_data = choice_data + + frame_max = self.if_feat.shape[0]//context + self.if_feat = np.reshape(self.if_feat[:frame_max*context,:],(frame_max,context,90)) + self.cents = np.reshape(self.cents[:frame_max*context],(frame_max,context)) + self.xcorr = np.reshape(self.xcorr[:frame_max*context,:],(frame_max,context,257)) + # self.cents = np.rint(60*np.log2(256/(self.periods + 1.0e-8))).astype('int') + # self.cents = np.clip(self.cents,0,239) + self.confidence = np.reshape(self.confidence[:frame_max*context],(frame_max,context)) + # print(self.if_feat.shape) + def __len__(self): + return self.if_feat.shape[0] + + def __getitem__(self, index): + if self.choice_data == 'both': + return torch.cat([torch.from_numpy(self.xcorr[index,:,:]),torch.from_numpy(self.if_feat[index,:,:])],dim=-1),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index]) + elif self.choice_data == 'if': + return torch.from_numpy(self.if_feat[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index]) + else: + return torch.from_numpy(self.xcorr[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index]) diff --git a/dnn/torch/neural-pitch/neural_pitch_update.py b/dnn/torch/neural-pitch/neural_pitch_update.py new file mode 100644 index 000000000..5d8074cff --- /dev/null +++ b/dnn/torch/neural-pitch/neural_pitch_update.py @@ -0,0 +1,207 @@ +import argparse +parser = argparse.ArgumentParser() + +parser.add_argument('features', type=str, help='Features generated from dump_data') +parser.add_argument('data', type=str, help='Data generated from dump_data (offset by 5ms)') +parser.add_argument('output', type=str, help='output .f32 feature file with replaced neural pitch') +parser.add_argument('pth_file', type=str, help='.pth file to use for pitch') +parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)') +parser.add_argument('--device', type=str, help='compute device',default = None,required = False) +parser.add_argument('--replace_xcorr', type = bool, default = False, 
help='Replace LPCNet xcorr with updated one') + +args = parser.parse_args() + +import os + +from utils import stft, random_filter +import subprocess +import numpy as np +import json +import torch +import tqdm + + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +if device is not None: + device = torch.device(args.device) + +# Loading the appropriate model +config_path = os.path.dirname(args.pth_file) + '/' + os.path.basename(args.pth_file).split('_')[0] + '_' + 'config_' + os.path.basename(args.pth_file).split('_')[-1][:-4] + '.json' +with open(config_path) as json_file: + dict_params = json.load(json_file) + +if dict_params['data_format'] == 'if': + from models import large_if_ccode as model + pitch_nn = model(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim']).to(device) +elif dict_params['data_format'] == 'xcorr': + from models import large_xcorr as model + pitch_nn = model(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device) +else: + from models import large_joint as model + pitch_nn = model(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device) + +pitch_nn.load_state_dict(torch.load(args.pth_file)) +pitch_nn = pitch_nn.to(device) + +N = dict_params['window_size'] +H = dict_params['hop_factor'] +freq_keep = dict_params['freq_keep'] + +# import os +# import argparse + + + +# os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["OMP_NUM_THREADS"] = "16" + +# parser = argparse.ArgumentParser() + +# parser.add_argument('features', type=str, help='input features') +# parser.add_argument('data', type=str, help='input data') +# parser.add_argument('output', type=str, help='output features') +# parser.add_argument('--add-confidence', action='store_true', help='add CREPE confidence to features') +# parser.add_argument('--viterbi', action='store_true', help='enable viterbi algo for pitch tracking') + + +def run_lpc(signal, lpcs, frame_length=160): + num_frames, lpc_order = lpcs.shape + + prediction = np.concatenate( + [- np.convolve(signal[i * frame_length : (i + 1) * frame_length + lpc_order - 1], lpcs[i], mode='valid') for i in range(num_frames)] + ) + error = signal[lpc_order :] - prediction + + return prediction, error + + +if __name__ == "__main__": + args = parser.parse_args() + + features = np.memmap(args.features, dtype=np.float32,mode = 'r').reshape((-1, 36)) + data = np.memmap(args.data, dtype=np.int16,mode = 'r').reshape((-1, 2)) + + num_frames = features.shape[0] + feature_dim = features.shape[1] + + assert feature_dim == 36 + + # if args.add_confidence: + # feature_dim += 1 + + output = np.memmap(args.output, dtype=np.float32, shape=(num_frames, feature_dim), mode='w+') + output[:, :36] = features + + # lpc coefficients and signal + lpcs = features[:, 20:36] + sig = data[:, 1] + + # parameters + # use_viterbi=args.viterbi + + # constants + pitch_min = 32 + pitch_max = 256 + lpc_order = 16 + fs = 16000 + frame_length = 160 + overlap_frames = 100 + chunk_size = 10000 + history_length = frame_length * overlap_frames + history = np.zeros(history_length, dtype=np.int16) + pitch_position=18 + xcorr_position=19 + conf_position=36 + + num_frames = len(sig) // 160 - 1 + + frame_start = 0 + frame_stop = min(frame_start + chunk_size, num_frames) + signal_start = 0 + signal_stop = frame_stop * frame_length + + niters = (num_frames - 1)//chunk_size + for i in tqdm.trange(niters): + if (frame_start > num_frames - 1): + break + chunk = np.concatenate((history, 
sig[signal_start:signal_stop])) + chunk_la = np.concatenate((history, sig[signal_start:signal_stop + 80])) + # time, frequency, confidence, _ = crepe.predict(chunk, fs, center=True, viterbi=True,verbose=0) + + # Feature computation + spec = stft(x = np.concatenate([np.zeros(80),chunk_la/(2**15 - 1)]), w = 'boxcar', N = N, H = H).T + phase_diff = spec*np.conj(np.roll(spec,1,axis = -1)) + phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8) + idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)]) + feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T + feature_if = feature[:,idx_save] + + data_temp = np.memmap('./temp_featcompute_' + dict_params['data_format'] + '_.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+') + data_temp[:chunk.shape[0]] = chunk_la[80:].astype(np.int16) + + subprocess.run([args.path_lpcnet_extractor, './temp_featcompute_' + dict_params['data_format'] + '_.raw', './temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw']) + feature_xcorr = np.flip(np.fromfile('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw', dtype='float32').reshape((-1,256),order = 'C'),axis = 1) + ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1) + feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1) + + os.remove('./temp_featcompute_' + dict_params['data_format'] + '_.raw') + os.remove('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw') + + if dict_params['data_format'] == 'if': + feature = feature_if + elif dict_params['data_format'] == 'xcorr': + feature = feature_xcorr + else: + indmin = min(feature_if.shape[0],feature_xcorr.shape[0]) + feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1) + + # Compute pitch with my model + model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device)) + model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy() + frequency = 62.5*2**(model_cents/1200) + + frequency = frequency[overlap_frames : overlap_frames + frame_stop - frame_start] + # confidence = confidence[overlap_frames : overlap_frames + frame_stop - frame_start] + + # convert frequencies to periods + periods = np.round(fs / frequency) + + # adjust to pitch range + # confidence[periods < pitch_min] = 0 + # confidence[periods > pitch_max] = 0 + periods = np.clip(periods, pitch_min, pitch_max) + + output[frame_start:frame_stop, pitch_position] = (periods - 100) / 50 + + # if args.replace_xcorr: + # re-calculate xcorr + frame_offset = (pitch_max + frame_length - 1) // frame_length + offset = frame_offset * frame_length + padding = lpc_order + + + if frame_start < frame_offset: + lpc_coeffs = np.concatenate((np.zeros((frame_offset - frame_start, lpc_order), dtype=np.float32), lpcs[:frame_stop])) + else: + lpc_coeffs = lpcs[frame_start - frame_offset : frame_stop] + + pred, error = run_lpc(chunk[history_length - offset - padding :], lpc_coeffs, frame_length=frame_length) + + xcorr = np.zeros(frame_stop - frame_start) + for i, p in enumerate(periods.astype(np.int16)): + if p > 0: + f1 = error[offset + i * frame_length : offset + (i + 1) * frame_length] + f2 = error[offset + i * frame_length - p : offset + (i + 1) * frame_length - p] + xcorr[i] = np.dot(f1, f2) / np.sqrt(np.dot(f1, f1) * np.dot(f2, f2) + 1e-6) + + output[frame_start:frame_stop, xcorr_position] = xcorr - 0.5 + + # update buffers and indices + history = 
chunk[-history_length :] + + frame_start += chunk_size + frame_stop += chunk_size + frame_stop = min(frame_stop, num_frames) + + signal_start = frame_start * frame_length + signal_stop = frame_stop * frame_length diff --git a/dnn/torch/neural-pitch/ptdb_process.sh b/dnn/torch/neural-pitch/ptdb_process.sh new file mode 100644 index 000000000..f4df54659 --- /dev/null +++ b/dnn/torch/neural-pitch/ptdb_process.sh @@ -0,0 +1,34 @@ +# Copy into PTDB root directory and run to combine all the male/female raw audio/references into below directories + +# Make folder for combined audio +mkdir -p './combined_mic_16k/' +# Make folder for combined pitch reference +mkdir -p './combined_reference_f0/' + +# Resample Male Audio +for i in ./MALE/MIC/**/*.wav; do +j="$(basename "$i" .wav)" +echo $j +sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer ./combined_mic_16k/$j.raw +done + +# Resample Female Audio +for i in ./FEMALE/MIC/**/*.wav; do +j="$(basename "$i" .wav)" +echo $j +sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer ./combined_mic_16k/$j.raw +done + +# Shift Male reference pitch files +for i in ./MALE/REF/**/*.f0; do +j="$(basename "$i" .wav)" +echo $j +cp "$i" ./combined_reference_f0/ +done + +# Shift Female reference pitch files +for i in ./FEMALE/REF/**/*.f0; do +j="$(basename "$i" .wav)" +echo $j +cp "$i" ./combined_reference_f0/ +done \ No newline at end of file diff --git a/dnn/torch/neural-pitch/training.py b/dnn/torch/neural-pitch/training.py new file mode 100644 index 000000000..bc0cce7ca --- /dev/null +++ b/dnn/torch/neural-pitch/training.py @@ -0,0 +1,162 @@ +""" +Training the neural pitch estimator + +""" + +import argparse +parser = argparse.ArgumentParser() + +parser.add_argument('features_if', type=str, help='.f32 IF Features for training (generated by augmentation script)') +parser.add_argument('features_xcorr', type=str, help='.f32 Xcorr Features for training (generated by augmentation script)') +parser.add_argument('features_pitch', type=str, help='.npy Pitch file for training (generated by augmentation script)') +parser.add_argument('output_folder', type=str, help='Output directory to store the model weights and config') +parser.add_argument('data_format', type=str, help='Choice of Input Data',choices=['if','xcorr','both']) +parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False) +parser.add_argument('--confidence_threshold', type=float, help='Confidence value below which pitch will be neglected during training',default = 0.4,required = False) +parser.add_argument('--context', type=int, help='Sequence length during training',default = 100,required = False) +parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False) +parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False) +parser.add_argument('--xcorr_dimension', type=int, help='Dimension of Input cross-correlation',default = 257,required = False) +parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False) +parser.add_argument('--gru_dim', type=int, help='GRU Dimension',default = 64,required = False) +parser.add_argument('--output_dim', type=int, help='Output dimension',default = 192,required = False) +parser.add_argument('--learning_rate', type=float, help='Learning Rate',default = 1.0e-3,required = False) +parser.add_argument('--epochs', type=int, help='Number of training epochs',default = 50,required 
= False) +parser.add_argument('--choice_cel', type=str, help='Choice of Cross Entropy Loss (default or robust)',choices=['default','robust'],default = 'default',required = False) + + +args = parser.parse_args() + +# import os +# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index) + +# Fixing the seeds for reproducability +import time +np_seed = int(time.time()) +torch_seed = int(time.time()) + +import json +import torch +torch.manual_seed(torch_seed) +import numpy as np +np.random.seed(np_seed) +from utils import count_parameters +import tqdm +import sys +from datetime import datetime +from evaluation import rpa + +# print(list(range(torch.cuda.device_count()))) +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# device = 'cpu' + +from models import loader_joint as loader +if args.data_format == 'if': + from models import large_if_ccode as model + pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim) +elif args.data_format == 'xcorr': + from models import large_xcorr as model + pitch_nn = model(args.xcorr_dimension,args.gru_dim,args.output_dim) +else: + from models import large_joint as model + pitch_nn = model(args.freq_keep*3,args.xcorr_dimension,args.gru_dim,args.output_dim) + +dataset_training = loader(args.features_if,args.features_pitch,args.features_xcorr,args.confidence_threshold,args.context,args.data_format) + +def loss_custom(logits,labels,confidence,choice = 'default',nmax = 192,q = 0.7): + logits_softmax = torch.nn.Softmax(dim = 1)(logits).permute(0,2,1) + labels_one_hot = torch.nn.functional.one_hot(labels.long(),nmax) + + if choice == 'default': + # Categorical Cross Entropy + CE = -torch.sum(torch.log(logits_softmax*labels_one_hot + 1.0e-6)*labels_one_hot,dim=-1) + CE = torch.sum(confidence*CE) + + else: + # Robust Cross Entropy + CE = (1.0/q)*(1 - torch.sum(torch.pow(logits_softmax*labels_one_hot + 1.0e-7,q),dim=-1) ) + CE = torch.sum(confidence*CE) + + return CE + +# features = args.features +# pitch = args.crepe_pitch +# dataset_training = loader(features,pitch,args.confidence_threshold,args.freq_keep,args.context) +# dataset_training = loader(features,pitch,'../../../../testing/testing_features_10pct_xcorr.f32') + +train_dataset, test_dataset = torch.utils.data.random_split(dataset_training, [0.95,0.05],generator=torch.Generator().manual_seed(torch_seed)) + +batch_size = 256 +train_dataloader = torch.utils.data.DataLoader(dataset = train_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False) +test_dataloader = torch.utils.data.DataLoader(dataset = test_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False) + +# pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim).to(device) +pitch_nn = pitch_nn.to(device) +num_params = count_parameters(pitch_nn) +learning_rate = args.learning_rate +model_opt = torch.optim.Adam(pitch_nn.parameters(), lr = learning_rate) + +num_epochs = args.epochs + +for epoch in range(num_epochs): + losses = [] + pitch_nn.train() + with tqdm.tqdm(train_dataloader) as train_epoch: + for i, (xi, yi, ci) in enumerate(train_epoch): + yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True) + pi = pitch_nn(xi.float()) + loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim) + + model_opt.zero_grad() + loss.backward() + model_opt.step() + + losses.append(loss.item()) + avg_loss = np.mean(losses) + 
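# Show the running average training loss for this epoch on the progress bar +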
train_epoch.set_postfix({"Train Epoch" : epoch, "Train Loss":avg_loss}) + + if epoch % 5 == 0: + pitch_nn.eval() + losses = [] + with tqdm.tqdm(test_dataloader) as test_epoch: + for i, (xi, yi, ci) in enumerate(test_epoch): + yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True) + pi = pitch_nn(xi.float()) + loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim) + losses.append(loss.item()) + avg_loss = np.mean(losses) + test_epoch.set_postfix({"Epoch" : epoch, "Test Loss":avg_loss}) + +pitch_nn.eval() +rpa(pitch_nn,device,data_format = args.data_format) + +config = dict( +data_format = args.data_format, +epochs = num_epochs, +window_size = args.N, +hop_factor = args.H, +freq_keep = args.freq_keep, +batch_size = batch_size, +learning_rate = learning_rate, +confidence_threshold = args.confidence_threshold, +model_parameters = num_params, +np_seed = np_seed, +torch_seed = torch_seed, +xcorr_dim = args.xcorr_dimension, +dim_input = 3*args.freq_keep, +gru_dim = args.gru_dim, +output_dim = args.output_dim, +choice_cel = args.choice_cel, +context = args.context, +) + +now = datetime.now() +dir_pth_save = args.output_folder +dir_network = dir_pth_save + str(now) + '_net_' + args.data_format + '.pth' +dir_dictparams = dir_pth_save + str(now) + '_config_' + args.data_format + '.json' +# Save Weights +torch.save(pitch_nn.state_dict(), dir_network) +# Save Config +with open(dir_dictparams, 'w') as fp: + json.dump(config, fp) diff --git a/dnn/torch/neural-pitch/utils.py b/dnn/torch/neural-pitch/utils.py new file mode 100644 index 000000000..8930ad198 --- /dev/null +++ b/dnn/torch/neural-pitch/utils.py @@ -0,0 +1,59 @@ +""" +Utility functions that are commonly used +""" + +import numpy as np +from scipy.signal import windows, lfilter +from prettytable import PrettyTable + + +# Source: https://gist.github.com/thongonary/026210fc186eb5056f2b6f1ca362d912 +def count_parameters(model): + table = PrettyTable(["Modules", "Parameters"]) + total_params = 0 + for name, parameter in model.named_parameters(): + if not parameter.requires_grad: continue + param = parameter.numel() + table.add_row([name, param]) + total_params+=param + print(table) + print(f"Total Trainable Params: {total_params}") + return total_params + +def stft(x, w = 'boxcar', N = 320, H = 160): + x = np.concatenate([x,np.zeros(N)]) + # win_custom = np.concatenate([windows.hann(80)[:40],np.ones(240),windows.hann(80)[40:]]) + return np.stack([np.fft.rfft(x[i:i + N]*windows.get_window(w,N)) for i in np.arange(0,x.shape[0]-N,H)]) + +def random_filter(x): + # Randomly filter x with second order IIR filter with coefficients in between -3/8,3/8 + filter_coeff = np.random.uniform(low = -3.0/8, high = 3.0/8, size = 4) + b = [1,filter_coeff[0],filter_coeff[1]] + a = [1,filter_coeff[2],filter_coeff[3]] + return lfilter(b,a,x) + +def feature_xform(feature): + """ + Take as input the (N * 256) xcorr features output by LPCNet and perform the following + 1. Downsample and Upsample by 2 (followed by smoothing) + 2. 
Append positional embeddings (of dim k) corresponding to each xcorr lag (currently commented out below)
+    """
+
+    from scipy.signal import resample_poly, lfilter
+
+    # Upsample by 2, smooth, and truncate back to the original number of lags
+    feature_US = lfilter([0.25,0.5,0.25],[1],resample_poly(feature,2,1,axis = 1),axis = 1)[:,:feature.shape[1]]
+    # Downsample by 2, smooth, and zero-pad back to the original number of lags
+    feature_DS = lfilter([0.5,0.5],[1],resample_poly(feature,1,2,axis = 1),axis = 1)
+    Z_append = np.zeros((feature.shape[0],feature.shape[1] - feature_DS.shape[1]))
+    feature_DS = np.concatenate([feature_DS,Z_append],axis = -1)
+
+    # pos_embedding = []
+    # for i in range(k):
+    #     pos_embedding.append(np.cos((2**i)*np.pi*((np.repeat(np.arange(feature.shape[1]).reshape(feature.shape[1],1),feature.shape[0],axis = 1)).T/(2*feature.shape[1]))))
+
+    # pos_embedding = np.stack(pos_embedding,axis = -1)
+
+    # Stack the downsampled, original, and upsampled xcorr as three channels
+    feature = np.stack((feature_DS,feature,feature_US),axis = -1)
+    # feature = np.concatenate((feature,pos_embedding),axis = -1)
+
+    return feature
--
GitLab
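
Note: the models in this patch emit logits over 20-cent pitch classes relative to a 62.5 Hz (16000/256) reference, and the scripts convert between Hz, cents and class indices in several places. Below is a minimal sketch of that mapping using the default constants from models.py/training.py; the helper names are illustrative and not part of the patch.

```python
import numpy as np

F_REF = 16000 / 256      # 62.5 Hz pitch reference used throughout these scripts
CENTS_PER_BIN = 20       # classifier resolution in cents
N_BINS = 192             # default output_dim in models.py / training.py

def freq_to_bin(f_hz):
    # Cents above the reference; zero (unvoiced) frequencies map to bin 0,
    # and targets are clipped to [0, 179] as in the data loaders
    cents = 1200 * np.log2(np.divide(f_hz, F_REF, out=np.zeros_like(f_hz), where=f_hz != 0) + 1.0e-8)
    return np.clip(np.rint(cents / CENTS_PER_BIN), 0, 179).astype(int)

def bin_to_freq(b):
    # Inverse mapping, as used when converting the network output back to Hz
    return F_REF * 2.0 ** (CENTS_PER_BIN * b / 1200)

print(freq_to_bin(np.array([0.0, 62.5, 125.0, 250.0])))  # -> [  0   0  60 120]
print(bin_to_freq(np.array([0, 60, 120])))               # -> [ 62.5 125.  250. ]
```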