diff --git a/dnn/training_tf2/fec_encoder.py b/dnn/training_tf2/fec_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e473f05c0200080310015672db3fc8156b3c01d2
--- /dev/null
+++ b/dnn/training_tf2/fec_encoder.py
@@ -0,0 +1,180 @@
+"""Script: encode redundancy packets for Opus neural FEC from an audio file."""
+
+import os
+import subprocess
+import argparse
+
+
+import numpy as np
+from scipy.io import wavfile
+import tensorflow as tf
+
+from rdovae import new_rdovae_model, pvq_quantize, apply_dead_zone, sq_rate_metric
+from fec_packets import write_fec_packets, read_fec_packets
+
+
+# set to True to skip command line parsing and use the hard-coded arguments below
+debug = False
+
+if debug:
+    args = type('dummy', (object,),
+    {
+        'input' : 'item1.wav',
+        'weights' : 'testout/rdovae_alignment_fix_1024_120.h5',
+        'enc_lambda' : 0.0007,
+        'output' : "test_0007.fec",
+        'cond_size' : 1024,
+        'num_redundancy_frames' : 64,
+        'extra_delay' : 0,
+        'dump_data' : './dump_data'
+    })()
+    os.environ['CUDA_VISIBLE_DEVICES']=""
+else:
+    parser = argparse.ArgumentParser(description='Encode redundancy for Opus neural FEC. Designed for use with voip application and 20ms frames')
+
+    parser.add_argument('input', metavar='<input signal>', help='audio input (.wav or .raw or .pcm as int16)')
+    parser.add_argument('weights', metavar='<weights>', help='trained model file (.h5)')
+    parser.add_argument('enc_lambda', metavar='<lambda>', type=float, help='lambda for controlling encoder rate (default=0.0007)', default=0.0007)
+    parser.add_argument('output', type=str, help='output file (will be extended with .fec)')
+
+    parser.add_argument('--dump-data', type=str, default='./dump_data', help='path to dump data executable (default ./dump_data)')
+    parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
+    parser.add_argument('--num-redundancy-frames', default=64, type=int, help='number of redundancy frames per packet (default 64)')
+    parser.add_argument('--extra-delay', default=0, type=int, help="last features in packet are calculated with the decoder aligned samples, use this option to add extra delay (in samples at 16kHz)")
+
+    args = parser.parse_args()
+
+model, encoder, decoder = new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=1, cond_size=args.cond_size)
+model.load_weights(args.weights)
+
+lpc_order = 16
+
+## prepare input signal
+# SILK frame size is 20ms and LPCNet subframes are 10ms
+subframe_size = 160
+frame_size = 2 * subframe_size
+
+# 91 samples delay to align with SILK decoded frames
+silk_delay = 91
+
+# prepend zeros to have enough history to produce the first package
+zero_history = (args.num_redundancy_frames - 1) * frame_size
+
+total_delay = silk_delay + zero_history + args.extra_delay
+
+# load signal
+if args.input.endswith(('.raw', '.pcm')):
+    signal = np.fromfile(args.input, dtype='int16')
+
+elif args.input.endswith('.wav'):
+    fs, signal = wavfile.read(args.input)
+else:
+    raise ValueError(f'unknown input signal format: {args.input}')
+
+# fill up last frame with zeros
+padded_signal_length = len(signal) + total_delay
+tail = padded_signal_length % frame_size
+right_padding = (frame_size - tail) % frame_size
+
+signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))
+
+padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
+signal.tofile(padded_signal_file)
+
+# write signal and call dump_data to create features
+
+feature_file = os.path.splitext(args.input)[0] + '_features.f32'
+# pass an argument list (no shell) so that paths containing spaces or shell
+# metacharacters are handed to dump_data unmodified
+command = [args.dump_data, '-test', padded_signal_file, feature_file]
+r = subprocess.run(command)
+if r.returncode != 0:
+    raise RuntimeError(f"command '{' '.join(command)}' failed with exit code {r.returncode}")
+
+# feature dimensions
+nb_features = model.nb_used_features + lpc_order
+nb_used_features = model.nb_used_features
+
+# load features
+features = np.fromfile(feature_file, dtype='float32')
+num_subframes = len(features) // nb_features
+num_subframes = 2 * (num_subframes // 2)
+num_frames = num_subframes // 2
+
+features = np.reshape(features, (1, -1, nb_features))
+features = features[:, :, :nb_used_features]
+features = features[:, :num_subframes, :]
+
+# lambda and q_id (ToDo: check validity of lambda and q_id)
+enc_lambda = args.enc_lambda * np.ones((1, num_frames, 1))
+quant_id = np.round(10*np.log(enc_lambda/.0007)).astype('int16')
+
+
+# run encoder
+print("running fec encoder...")
+symbols, quant_embed_dec, gru_state_dec = encoder.predict([features, quant_id, enc_lambda])
+
+# apply quantization
+nsymbols = 80
+dead_zone = tf.math.softplus(quant_embed_dec[:, :, nsymbols : 2 * nsymbols])
+symbols = apply_dead_zone([symbols, dead_zone]).numpy()
+qsymbols = np.round(symbols)
+quant_gru_state_dec = pvq_quantize(gru_state_dec, 30)
+
+# rate estimate
+hard_distr_embed = tf.math.sigmoid(quant_embed_dec[:, :, 4 * nsymbols : ]).numpy()
+rate_input = np.concatenate((symbols, hard_distr_embed, enc_lambda), axis=-1)
+rates = sq_rate_metric(None, rate_input, reduce=False).numpy()
+
+# run decoder
+input_length = args.num_redundancy_frames // 2
+offset = args.num_redundancy_frames - 1
+
+packets = []
+packet_sizes = []
+
+for i in range(offset, num_frames):
+    print(f"processing frame {i - offset}...")
+    features = decoder.predict([symbols[:, i - 2 * input_length + 1 : i + 1 : 2, :], quant_embed_dec[:, :input_length, :], quant_gru_state_dec[:, i, :]])
+    packets.append(features)
+    packet_size = 8 * int((np.sum(rates[:, i - 2 * input_length + 1 : i + 1 : 2]) + 7) / 8) + 64
+    packet_sizes.append(packet_size)
+
+
+# write packets
+packet_file = args.output + '.fec' if not args.output.endswith('.fec') else args.output
+write_fec_packets(packet_file, packets, packet_sizes)
+
+
+print(f"average redundancy rate: {int(round(sum(packet_sizes) / len(packet_sizes) * 50 / 1000))} kbps")
+
+
+if False:
+
+    # sanity check
+    packets2 = read_fec_packets(packet_file)
+
+    print(f"{len(packets)=} {len(packets2)=}")
+
+    print(f"{packets[0][0, 0]=}")
+    print(f"{packets2[0][0, 0]=}")
+
+    # sanity checks
+    # 1. concatenate features at offset 0
+
+    test_features_batch2 = np.concatenate([packet[:,-2:, :] for packet in packets], axis=1)
+    print(f"{test_features_batch2.shape=}")
+
+    test_features_full_batch2 = np.zeros((test_features_batch2.shape[1], nb_features), dtype=np.float32)
+    test_features_full_batch2[:, :nb_used_features] = test_features_batch2[0, :, :]
+
+    test_features_full_batch2.tofile('test_features_batch2.f32')
+
+    # 2. concatenate in batches of 4
+    test_features_batch4 = np.concatenate([packet[:,-4:, :] for packet in packets[::2]], axis=1)
+    print(f"{test_features_batch4.shape=}")
+
+    test_features_full_batch4 = np.zeros((test_features_batch4.shape[1], nb_features), dtype=np.float32)
+    test_features_full_batch4[:, :nb_used_features] = test_features_batch4[0, :, :]
+
+    test_features_full_batch4.tofile('test_features_batch4.f32')
diff --git a/dnn/training_tf2/fec_packets.c b/dnn/training_tf2/fec_packets.c
new file mode 100644
index 0000000000000000000000000000000000000000..73b695bd4b925d0f5e0d71b5dc779088e4ae2191
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.c
@@ -0,0 +1,150 @@
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "fec_packets.h"
+
+/* Reads the features of one subframe from a packet of a .fec file.
+ *
+ * filename        path of the .fec file; it is opened and closed on every call
+ * features        output buffer for num_features floats (num_features is taken from the file header)
+ * packet_index    zero-based index of the packet
+ * subframe_index  zero-based index of the subframe inside the packet
+ *
+ * Returns 0 on success and 1 on failure (file cannot be opened, short read,
+ * failed seek or index out of bounds).
+ */
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index)
+{
+
+    int16_t version;
+    int16_t header_size;
+    int16_t num_packets;
+    int16_t packet_size;
+    int16_t subframe_size;
+    int16_t subframes_per_packet;
+    int16_t num_features;
+    long offset;
+
+    FILE *fid = fopen(filename, "rb");
+
+    /* fclose(NULL) is undefined, so a failed open must not take the common error path */
+    if (fid == NULL)
+    {
+        fprintf(stderr, "get_fec_frame: could not open file %s\n", filename);
+        return 1;
+    }
+
+    /* read header */
+    if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
+    if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
+    if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
+    if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
+    if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
+    if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
+    if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;
+
+    /* check if indices are valid */
+    if (packet_index < 0 || packet_index >= num_packets || subframe_index < 0 || subframe_index >= subframes_per_packet)
+    {
+        fprintf(stderr, "get_fec_frame: index out of bounds\n");
+        goto error;
+    }
+
+    /* a negative feature count would be converted to a huge size_t below */
+    if (num_features < 0) goto error;
+
+    /* calculate offset in file (+ 2 is for rate) */
+    offset = (long) header_size + (long) packet_index * packet_size + 2 + (long) subframe_index * subframe_size;
+    if (fseek(fid, offset, SEEK_SET) != 0) goto error;
+
+    /* read features */
+    if (fread(features, sizeof(*features), (size_t) num_features, fid) != (size_t) num_features) goto error;
+
+    fclose(fid);
+    return 0;
+
+error:
+    fclose(fid);
+    return 1;
+}
+
+/* Reads the rate stored with a packet of a .fec file.
+ *
+ * filename      path of the .fec file; it is opened and closed on every call
+ * packet_index  zero-based index of the packet
+ *
+ * Returns the stored rate on success and -1 on failure (file cannot be
+ * opened, short read, failed seek or index out of bounds).
+ */
+int get_fec_rate(const char * const filename, int packet_index)
+{
+    int16_t version;
+    int16_t header_size;
+    int16_t num_packets;
+    int16_t packet_size;
+    int16_t subframe_size;
+    int16_t subframes_per_packet;
+    int16_t num_features;
+    long offset;
+    int16_t rate;
+
+    FILE *fid = fopen(filename, "rb");
+
+    /* fclose(NULL) is undefined, so a failed open must not take the common error path */
+    if (fid == NULL)
+    {
+        fprintf(stderr, "get_fec_rate: could not open file %s\n", filename);
+        return -1;
+    }
+
+    /* read header */
+    if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
+    if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
+    if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
+    if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
+    if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
+    if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
+    if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;
+
+    /* check if indices are valid */
+    if (packet_index < 0 || packet_index >= num_packets)
+    {
+        fprintf(stderr, "get_fec_rate: index out of bounds\n");
+        goto error;
+    }
+
+    /* calculate offset in file (the rate is stored in the first two bytes of a packet) */
+    offset = (long) header_size + (long) packet_index * packet_size;
+    if (fseek(fid, offset, SEEK_SET) != 0) goto error;
+
+    /* read rate */
+    if (fread(&rate, sizeof(rate), 1, fid) != 1) goto error;
+
+    fclose(fid);
+    return (int) rate;
+
+error:
+    fclose(fid);
+    return -1;
+}
+
+#if 0
+int main()
+{
+    float features[20];
+    int i;
+
+    if (get_fec_frame("../test.fec", &features[0], 0, 127))
+    {
+        return 1;
+    }
+
+    for (i = 0; i < 20; i ++)
+    {
+        printf("%d %f\n", i, features[i]);
+    }
+
+    printf("rate: %d\n", get_fec_rate("../test.fec", 0));
+
+}
+#endif
\ No newline at end of file
diff --git a/dnn/training_tf2/fec_packets.h b/dnn/training_tf2/fec_packets.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7591560b308336f75048f1597554aecdd1cb9c8
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.h
@@ -0,0 +1,7 @@
+#ifndef _FEC_PACKETS_H
+#define _FEC_PACKETS_H
+
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index);
+int get_fec_rate(const char * const filename, int packet_index);
+
+#endif
\ No newline at end of file
diff --git a/dnn/training_tf2/fec_packets.py b/dnn/training_tf2/fec_packets.py
new file mode 100644
index 0000000000000000000000000000000000000000..0aab2a4e65f392726a726886d1aad99b3c363bae
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.py
@@ -0,0 +1,106 @@
+import numpy as np
+
+
+
+def write_fec_packets(filename, packets, rates=None):
+    """ writes packets in binary format
+
+    The file starts with a header of seven int16 values (version, header_size,
+    num_packets, packet_size, subframe_size, subframes_per_packet, num_features).
+    It is followed by one record per packet: the rate as int16 and the features
+    as float32 with the subframe axis (second to last axis) reversed.
+
+    Parameters:
+        filename: path of the output file
+        packets: sequence of arrays whose last two axes are (subframes, features);
+            all sizes are derived from packets[0]
+        rates: optional sequence with one rate per packet, 0 is stored for every
+            packet if omitted
+    """
+
+    assert np.dtype(np.float32).itemsize == 4
+    assert np.dtype(np.int16).itemsize == 2
+
+    # derive some sizes
+    num_packets = len(packets)
+    subframes_per_packet = packets[0].shape[-2]
+    num_features = packets[0].shape[-1]
+
+    # size of float is 4
+    subframe_size = num_features * 4
+    packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate
+
+    version = 1
+    # header size (version, header_size, num_packets, packet_size, subframe_size, subrames_per_packet, num_features)
+    header_size = 14
+
+    with open(filename, 'wb') as f:
+
+        # header
+        f.write(np.int16(version).tobytes())
+        f.write(np.int16(header_size).tobytes())
+        f.write(np.int16(num_packets).tobytes())
+        f.write(np.int16(packet_size).tobytes())
+        f.write(np.int16(subframe_size).tobytes())
+        f.write(np.int16(subframes_per_packet).tobytes())
+        f.write(np.int16(num_features).tobytes())
+
+        # packets
+        for i, packet in enumerate(packets):
+            rate = 0 if rates is None else rates[i]
+
+            f.write(np.int16(rate).tobytes())
+
+            features = np.flip(packet, axis=-2)
+            f.write(features.astype(np.float32).tobytes())
+
+
+def read_fec_packets(filename, return_rates=False):
+    """ reads packets from binary format
+
+    Parameters:
+        filename: path of a file written by write_fec_packets
+        return_rates: if True, the rates stored with the packets are returned too
+
+    Returns:
+        list of float32 arrays of shape (1, subframes_per_packet, num_features) in
+        which the subframe reversal applied by write_fec_packets is undone; if
+        return_rates is True, a tuple (packets, rates) with rates as list of ints
+    """
+
+    assert np.dtype(np.float32).itemsize == 4
+    assert np.dtype(np.int16).itemsize == 2
+
+    with open(filename, 'rb') as f:
+
+        # header
+        version = np.frombuffer(f.read(2), dtype=np.int16).item()
+        header_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+        num_packets = np.frombuffer(f.read(2), dtype=np.int16).item()
+        packet_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+        subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+        subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item()
+        num_features = np.frombuffer(f.read(2), dtype=np.int16).item()
+
+        # packets start right after the header (fec_packets.c computes offsets the same way)
+        f.seek(header_size)
+
+        packet_shape = (1, subframes_per_packet, num_features)
+
+        # packets
+        rates = []
+        packets = []
+        for _ in range(num_packets):
+
+            # item() must be called, the bare attribute is a bound method and not the value
+            rate = np.frombuffer(f.read(2), dtype=np.int16).item()
+            rates.append(rate)
+
+            features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), packet_shape)
+            packet = np.flip(features, axis=-2)
+            packets.append(packet)
+
+    if return_rates:
+        return packets, rates
+
+    return packets
\ No newline at end of file