Deep Network with Batch Normalization Training Example [Single Thread] [Multi Thread]
General architecture of this network:

- Input: 640x640x3
- Convolutional layer: 7x7x64 kernels, stride 3, leaky ReLU, max pooling 2x2 (stride 2), local response normalization
- Convolutional layer: 3x3x192 kernels, stride 1, leaky ReLU, max pooling 2x2 (stride 2), local response normalization
- Residual layer with 4 convolutional layers:
  1. 1x1x128 kernels, stride 1, leaky ReLU, local response normalization
  2. 3x3x256 kernels, stride 1, leaky ReLU, local response normalization
  3. 1x1x256 kernels, stride 1, leaky ReLU, local response normalization
  4. 3x3x192 kernels, stride 1, padding 2, leaky ReLU
- Batch normalization, then max pooling 2x2 (stride 2)
- Residual layer with the same 4 convolutional layers as above
- Batch normalization, then max pooling 2x2 (stride 2)
- Residual layer with the same 4 convolutional layers as above
- Batch normalization, then max pooling (stride 2)
- Fully connected layer: 1024 output neurons, leaky ReLU
- Fully connected layer: 1024 output neurons, leaky ReLU
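The spatial dimensions above are easy to get wrong, so here is a small standalone sanity check, assuming only the usual output-size formula out = (in + 2*padding - kernel)/stride + 1 (it does not use llab), that reproduces the 640 -> 106 -> 52 -> 26 -> 13 -> 6 progression the layers below are sized with:

#include <stdio.h>

/* standard output-size formula for a convolution or pooling stage */
static int out_dim(int in, int kernel, int stride, int padding){
    return (in + 2*padding - kernel)/stride + 1;
}

int main(){
    int d = 640;
    d = out_dim(d, 7, 3, 0); /* conv 7x7, stride 3      -> 212 */
    d = out_dim(d, 2, 2, 0); /* max pool 2x2, stride 2  -> 106 */
    d = out_dim(d, 3, 1, 0); /* conv 3x3, stride 1      -> 104 */
    d = out_dim(d, 2, 2, 0); /* max pool 2x2, stride 2  -> 52  */
    /* each residual layer preserves the size (52 -> 52 -> 50 -> 50 -> 52,
       thanks to padding = 2 on its last 3x3 convolution) */
    d = out_dim(d, 2, 2, 0); /* pool after 1st residual -> 26  */
    d = out_dim(d, 2, 2, 0); /* pool after 2nd residual -> 13  */
    d = out_dim(d, 2, 2, 0); /* pool after 3rd residual -> 6 (the code below pools 3x3, stride 2, which also gives 6) */
    printf("final feature map: 192x%dx%d\n", d, d); /* 192*6*6 inputs for f1[0] */
    return 0;
}

The single-threaded version of the training example follows; the multi-threaded version comes after it.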
#include <stdio.h>
#include <stdlib.h>
#include "llab.h"
int main(){
/* First model with 6 convolutional layers */
/* Second model with 4 convolutional layers grouped up in a residual layer */
/* Third model with 4 convolutional layers grouped up in a residual layer */
int output = 1024; //not chosen yet
int input_channels = 3, input_rows = 640, input_cols = 640;
int batch_size = 2,n_instances = 10;
float lr = 0.0003; // learning rate (example value)
float b1_adam1 = BETA1_ADAM, b2_adam1 = BETA2_ADAM;
float b1_adam2 = BETA1_ADAM, b2_adam2 = BETA2_ADAM;
float b1_adam3 = BETA1_ADAM, b2_adam3 = BETA2_ADAM;
float b1_adam4 = BETA1_ADAM, b2_adam4 = BETA2_ADAM;
float b1_adam5 = BETA1_ADAM, b2_adam5 = BETA2_ADAM;
int i,j;
float** input = (float**)malloc(sizeof(float*)*n_instances);
for(i = 0; i < n_instances; i++){
input[i] = (float*)calloc(input_channels*input_rows*input_cols,sizeof(float));
}
cl** c1 = (cl**)malloc(sizeof(cl*)*2);
cl** c2 = (cl**)malloc(sizeof(cl*)*4);
cl** c3 = (cl**)malloc(sizeof(cl*)*4);
cl** c4 = (cl**)malloc(sizeof(cl*)*4);
rl** r1 = (rl**)malloc(sizeof(rl*));
rl** r2 = (rl**)malloc(sizeof(rl*));
rl** r3 = (rl**)malloc(sizeof(rl*));
fcl** f1 = (fcl**)malloc(sizeof(fcl*)*2);
/* batch normalization and max-pooling after the 3 residual layers */
bn** b1 = (bn**)malloc(sizeof(bn*)*3);
cl** c5 = (cl**)malloc(sizeof(cl*));
cl** c6 = (cl**)malloc(sizeof(cl*));
cl** c7 = (cl**)malloc(sizeof(cl*));
// First 2 convolutional layers
c1[0] = convolutional(3,640,640,7,7,64,3,3,0,0,2,2,0,0,2,2,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,MAX_POOLING,0,CONVOLUTION);
c1[1] = convolutional(64,106,106,3,3,192,1,1,0,0,2,2,0,0,2,2,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,MAX_POOLING,1,CONVOLUTION);
// 4 convolutional layers of the first residual layer
c2[0] = convolutional(192,52,52,1,1,128,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,2,CONVOLUTION);
c2[1] = convolutional(128,52,52,3,3,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,3,CONVOLUTION);
c2[2] = convolutional(256,50,50,1,1,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,4,CONVOLUTION);
c2[3] = convolutional(256,50,50,3,3,192,1,1,2,2,1,1,0,0,0,0,NO_NORMALIZATION,LEAKY_RELU,NO_POOLING,5,CONVOLUTION);
// batch normalization (b1[0]) and max pooling 2x2, s = 2 (c5) are applied here, inside the training loop
// 4 convolutional layers of the second residual layer
c3[0] = convolutional(192,26,26,1,1,128,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,1,CONVOLUTION);
c3[1] = convolutional(128,26,26,3,3,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,2,CONVOLUTION);
c3[2] = convolutional(256,24,24,1,1,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,3,CONVOLUTION);
c3[3] = convolutional(256,24,24,3,3,192,1,1,2,2,1,1,0,0,0,0,NO_NORMALIZATION,LEAKY_RELU,NO_POOLING,4,CONVOLUTION);
// batch normalization (b1[1]) and max pooling 2x2, s = 2 (c6) are applied here, inside the training loop
// 4 convolutional layers of the third residual layer
c4[0] = convolutional(192,13,13,1,1,128,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,1,CONVOLUTION);
c4[1] = convolutional(128,13,13,3,3,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,2,CONVOLUTION);
c4[2] = convolutional(256,11,11,1,1,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,3,CONVOLUTION);
c4[3] = convolutional(256,11,11,3,3,192,1,1,2,2,1,1,0,0,0,0,NO_NORMALIZATION,LEAKY_RELU,NO_POOLING,4,CONVOLUTION);
// batch normalization (b1[2]) and max pooling (c7) are applied here, inside the training loop
// 192x6x6 total output at this point
// 2 terminal fully connected layers
f1[0] = fully_connected(192*6*6,1024,1,NO_DROPOUT,LEAKY_RELU,0);
f1[1] = fully_connected(1024,output,2,NO_DROPOUT,LEAKY_RELU,0);
// initialize the 3 residual layers
r1[0] = residual(192,52,52,4,c2);
r2[0] = residual(192,26,26,4,c3);
r3[0] = residual(192,13,13,4,c4);
// use leaky relu as the final activation after each residual layer
r1[0]->cl_output->activation_flag = LEAKY_RELU;
r2[0]->cl_output->activation_flag = LEAKY_RELU;
r3[0]->cl_output->activation_flag = LEAKY_RELU;
// batch normalization and max pooling layers
b1[0] = batch_normalization(batch_size,192*52*52,0,NO_ACTIVATION);
b1[1] = batch_normalization(batch_size,192*26*26,1,NO_ACTIVATION);
b1[2] = batch_normalization(batch_size,192*13*13,2,NO_ACTIVATION);
c5[0] = convolutional(192,52,52,1,1,192,1,1,0,0,2,2,0,0,2,2,NO_NORMALIZATION,NO_ACTIVATION,MAX_POOLING,0,NO_CONVOLUTION);
c6[0] = convolutional(192,26,26,1,1,192,1,1,0,0,2,2,0,0,2,2,NO_NORMALIZATION,NO_ACTIVATION,MAX_POOLING,0,NO_CONVOLUTION);
c7[0] = convolutional(192,13,13,1,1,192,1,1,0,0,2,2,0,0,3,3,NO_NORMALIZATION,NO_ACTIVATION,MAX_POOLING,0,NO_CONVOLUTION);
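// note: c5, c6 and c7 are built with the NO_CONVOLUTION flag, so they apply no kernels;
// they only provide the max-pooling stage that follows each batch-normalization step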
// create the networks; the temporary networks (e.g. temp_m1) accumulate the sum of the derivatives across the mini-batches
model* m1 = network(6,1,2,0,r1,c1,NULL);
model* temp_m1 = copy_model(m1);
model* m2 = network(5,1,1,0,r2,c5,NULL);
model* temp_m2 = copy_model(m2);
model* m3 = network(5,1,1,0,r3,c6,NULL);
model* temp_m3 = copy_model(m3);
model* m4 = network(3,0,1,2,NULL,c7,f1);
model* temp_m4 = copy_model(m4);
bmodel* bm = batch_network(3,0,0,0,3,NULL,NULL,NULL,b1);
// the mini batch models
model** batch_m1 = (model**)malloc(sizeof(model*)*batch_size);
model** batch_m2 = (model**)malloc(sizeof(model*)*batch_size);
model** batch_m3 = (model**)malloc(sizeof(model*)*batch_size);
model** batch_m4 = (model**)malloc(sizeof(model*)*batch_size);
for(i = 0; i < batch_size; i++){
batch_m1[i] = copy_model(m1);
batch_m2[i] = copy_model(m2);
batch_m3[i] = copy_model(m3);
batch_m4[i] = copy_model(m4);
}
// input vectors after the first, second and third residual layers
float** input_vectors1 = (float**)malloc(sizeof(float*)*batch_size);
float** input_vectors2 = (float**)malloc(sizeof(float*)*batch_size);
float** input_vectors3 = (float**)malloc(sizeof(float*)*batch_size);
for(i = 0; i < batch_size; i++){
input_vectors1[i] = batch_m1[i]->rls[0]->cl_output->post_activation;
input_vectors2[i] = batch_m2[i]->rls[0]->cl_output->post_activation;
input_vectors3[i] = batch_m3[i]->rls[0]->cl_output->post_activation;
}
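// note: input_vectors1/2/3 alias the post-activation buffers of each replica's residual output,
// so the batch-normalization calls below read the models' outputs in place, without extra copies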
/* Training */
for(i = 0; i < n_instances/batch_size; i++){
// Feed forward
for(j = 0; j < batch_size; j++){
model_tensor_input_ff(batch_m1[j],input_channels, input_rows, input_cols, input[i*batch_size+j]);
}
batch_normalization_feed_forward(batch_size, input_vectors1,b1[0]->temp_vectors, b1[0]->vector_dim, b1[0]->gamma, b1[0]->beta, b1[0]->mean, b1[0]->var, b1[0]->outputs,b1[0]->epsilon);
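// batch_normalization_feed_forward is assumed to apply the standard transform per mini-batch:
// outputs = gamma * (input_vectors - mean)/sqrt(var + epsilon) + beta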
for(j = 0; j < batch_size; j++){
model_tensor_input_ff(batch_m2[j],batch_m2[j]->cls[0]->channels,batch_m2[j]->cls[0]->input_rows,batch_m2[j]->cls[0]->input_cols,b1[0]->outputs[j]);
}
batch_normalization_feed_forward(batch_size, input_vectors2,b1[1]->temp_vectors, b1[1]->vector_dim, b1[1]->gamma, b1[1]->beta, b1[1]->mean, b1[1]->var, b1[1]->outputs,b1[1]->epsilon);
for(j = 0; j < batch_size; j++){
model_tensor_input_ff(batch_m3[j],batch_m3[j]->cls[0]->channels,batch_m3[j]->cls[0]->input_rows,batch_m3[j]->cls[0]->input_cols,b1[1]->outputs[j]);
}
batch_normalization_feed_forward(batch_size, input_vectors3,b1[2]->temp_vectors, b1[2]->vector_dim, b1[2]->gamma, b1[2]->beta, b1[2]->mean, b1[2]->var, b1[2]->outputs,b1[2]->epsilon);
for(j = 0; j < batch_size; j++){
model_tensor_input_ff(batch_m4[j],batch_m4[j]->cls[0]->channels,batch_m4[j]->cls[0]->input_rows,batch_m4[j]->cls[0]->input_cols,b1[2]->outputs[j]);
}
// Set the errors (left at zero here; in a real run they would hold the loss derivatives w.r.t. the final outputs)
float** error = (float**)malloc(sizeof(float*)*batch_size);
for(j = 0; j < batch_size; j++){
error[j] = (float*)calloc(batch_m4[j]->fcls[1]->output,sizeof(float));
}
// BackPropagation
for(j = 0; j < batch_size; j++){
error[j] = model_tensor_input_bp(batch_m4[j],batch_m4[j]->cls[0]->channels,batch_m4[j]->cls[0]->input_rows,batch_m4[j]->cls[0]->input_cols,b1[2]->outputs[j], error[j], batch_m4[j]->fcls[1]->output);
}
batch_normalization_back_prop(batch_size,input_vectors3,b1[2]->temp_vectors, b1[2]->vector_dim, b1[2]->gamma, b1[2]->beta, b1[2]->mean, b1[2]->var,error,b1[2]->d_gamma, b1[2]->d_beta,b1[2]->error2, b1[2]->temp1,b1[2]->temp2, b1[2]->epsilon);
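// batch_normalization_back_prop fills d_gamma/d_beta with the parameter gradients and error2 with
// the gradient w.r.t. the batch-norm inputs; below it is chained through the leaky relu derivative
// of each residual output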
for(j = 0; j < batch_size; j++){
// compute the leaky relu derivative and chain it with the batch-norm input gradient
derivative_leaky_relu_array(batch_m3[j]->rls[0]->cl_output->pre_activation,batch_m3[j]->rls[0]->cl_output->temp3,b1[2]->vector_dim);
dot1D(batch_m3[j]->rls[0]->cl_output->temp3,b1[2]->error2[j],batch_m3[j]->rls[0]->cl_output->temp,b1[2]->vector_dim);
error[j] = model_tensor_input_bp(batch_m3[j],batch_m3[j]->cls[0]->channels,batch_m3[j]->cls[0]->input_rows,batch_m3[j]->cls[0]->input_cols,b1[1]->outputs[j], batch_m3[j]->rls[0]->cl_output->temp, batch_m3[j]->rls[0]->cl_output->n_kernels*batch_m3[j]->rls[0]->cl_output->rows1*batch_m3[j]->rls[0]->cl_output->cols1);
}
batch_normalization_back_prop(batch_size,input_vectors2,b1[1]->temp_vectors, b1[1]->vector_dim, b1[1]->gamma, b1[1]->beta, b1[1]->mean, b1[1]->var,error,b1[1]->d_gamma, b1[1]->d_beta,b1[1]->error2, b1[1]->temp1,b1[1]->temp2, b1[1]->epsilon);
for(j = 0; j < batch_size; j++){
// compute the leaky relu derivative and chain it with the batch-norm input gradient
derivative_leaky_relu_array(batch_m2[j]->rls[0]->cl_output->pre_activation,batch_m2[j]->rls[0]->cl_output->temp3,b1[1]->vector_dim);
dot1D(batch_m2[j]->rls[0]->cl_output->temp3,b1[1]->error2[j],batch_m2[j]->rls[0]->cl_output->temp,b1[1]->vector_dim);
error[j] = model_tensor_input_bp(batch_m2[j],batch_m2[j]->cls[0]->channels,batch_m2[j]->cls[0]->input_rows,batch_m2[j]->cls[0]->input_cols,b1[0]->outputs[j], batch_m2[j]->rls[0]->cl_output->temp, batch_m2[j]->rls[0]->cl_output->n_kernels*batch_m2[j]->rls[0]->cl_output->rows1*batch_m2[j]->rls[0]->cl_output->cols1);
}
batch_normalization_back_prop(batch_size,input_vectors1,b1[0]->temp_vectors, b1[0]->vector_dim, b1[0]->gamma, b1[0]->beta, b1[0]->mean, b1[0]->var,error,b1[0]->d_gamma, b1[0]->d_beta,b1[0]->error2, b1[0]->temp1,b1[0]->temp2, b1[0]->epsilon);
for(j = 0; j < batch_size; j++){
// compute the leaky relu derivative and chain it with the batch-norm input gradient
derivative_leaky_relu_array(batch_m1[j]->rls[0]->cl_output->pre_activation,batch_m1[j]->rls[0]->cl_output->temp3,b1[0]->vector_dim);
dot1D(batch_m1[j]->rls[0]->cl_output->temp3,b1[0]->error2[j],batch_m1[j]->rls[0]->cl_output->temp,b1[0]->vector_dim);
error[j] = model_tensor_input_bp(batch_m1[j],input_channels,input_rows,input_cols,input[i*batch_size+j], batch_m1[j]->rls[0]->cl_output->temp, batch_m1[j]->rls[0]->cl_output->n_kernels*batch_m1[j]->rls[0]->cl_output->rows1*batch_m1[j]->rls[0]->cl_output->cols1);
}
for(j = 0; j < batch_size; j++){
sum_model_partial_derivatives(temp_m1,batch_m1[j],temp_m1);
sum_model_partial_derivatives(temp_m2,batch_m2[j],temp_m2);
sum_model_partial_derivatives(temp_m3,batch_m3[j],temp_m3);
sum_model_partial_derivatives(temp_m4,batch_m4[j],temp_m4);
}
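// update the batch-norm parameters and the four models with Adam; batch_size is passed so the
// accumulated partial derivatives can be scaled over the mini-batch (assumed from the API)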
update_bmodel(bm,lr,0.9,batch_size,ADAM,&b1_adam5,&b2_adam5,NO_REGULARIZATION,0,0);
update_model(temp_m1,lr,0.9,batch_size,ADAM,&b1_adam1,&b2_adam1,NO_REGULARIZATION,0,0);
update_model(temp_m2,lr,0.9,batch_size,ADAM,&b1_adam2,&b2_adam2,NO_REGULARIZATION,0,0);
update_model(temp_m3,lr,0.9,batch_size,ADAM,&b1_adam3,&b2_adam3,NO_REGULARIZATION,0,0);
update_model(temp_m4,lr,0.9,batch_size,ADAM,&b1_adam4,&b2_adam4,NO_REGULARIZATION,0,0);
reset_bmodel(bm);
reset_model(temp_m1);
reset_model(temp_m2);
reset_model(temp_m3);
reset_model(temp_m4);
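// copy the updated weights from the temporary models back into every mini-batch replica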
for(j = 0; j < batch_size; j++){
paste_model(temp_m1,batch_m1[j]);
paste_model(temp_m2,batch_m2[j]);
paste_model(temp_m3,batch_m3[j]);
paste_model(temp_m4,batch_m4[j]);
}
}
for(i = 0; i < n_instances; i++){
free(input[i]);
}
free(input);
free_model(temp_m1);
free_model(temp_m2);
free_model(temp_m3);
free_model(temp_m4);
free_model(m1);
free_model(m2);
free_model(m3);
free_model(m4);
free_bmodel(bm);
for(i = 0; i < batch_size; i++){
free_model(batch_m1[i]);
free_model(batch_m2[i]);
free_model(batch_m3[i]);
free_model(batch_m4[i]);
}
free(batch_m1);
free(batch_m2);
free(batch_m3);
free(batch_m4);
free(input_vectors1);
free(input_vectors2);
free(input_vectors3);
// Note: the error buffers allocated inside the training loop are never freed in this example
}
Multi-threaded version:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "llab.h"
void read_input_output_from_files(float** input, float** output, int batch_size, char** files, char* directory);
int main(){
/* First model with 6 convolutional layers */
/* Second model with 4 convolutional layers grouped up in a residual layer */
/* Third model with 4 convolutional layers grouped up in a residual layer */
srand(time(NULL));
int output = 183;
int input_channels = 3, input_rows = 640, input_cols = 640;
int batch_size = 3,n_instances = 10;
float lr = 0.0003; // learning rate (example value)
float b1_adam1 = BETA1_ADAM, b2_adam1 = BETA2_ADAM;
float b1_adam2 = BETA1_ADAM, b2_adam2 = BETA2_ADAM;
float b1_adam3 = BETA1_ADAM, b2_adam3 = BETA2_ADAM;
float b1_adam4 = BETA1_ADAM, b2_adam4 = BETA2_ADAM;
float b1_adam5 = BETA1_ADAM, b2_adam5 = BETA2_ADAM;
int i,j;
float** input = (float**)malloc(sizeof(float*)*n_instances);
for(i = 0; i < n_instances; i++){
input[i] = (float*)calloc(input_channels*input_rows*input_cols,sizeof(float));
}
cl** c1 = (cl**)malloc(sizeof(cl*)*2);
cl** c2 = (cl**)malloc(sizeof(cl*)*4);
cl** c3 = (cl**)malloc(sizeof(cl*)*4);
cl** c4 = (cl**)malloc(sizeof(cl*)*4);
rl** r1 = (rl**)malloc(sizeof(rl*));
rl** r2 = (rl**)malloc(sizeof(rl*));
rl** r3 = (rl**)malloc(sizeof(rl*));
fcl** f1 = (fcl**)malloc(sizeof(fcl*)*2);
/* batch normalization and max-pooling after the 3 residual layers */
bn** b1 = (bn**)malloc(sizeof(bn*)*3);
cl** c5 = (cl**)malloc(sizeof(cl*));
cl** c6 = (cl**)malloc(sizeof(cl*));
cl** c7 = (cl**)malloc(sizeof(cl*));
// First 2 convolutional layers
c1[0] = convolutional(3,640,640,7,7,64,3,3,0,0,2,2,0,0,2,2,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,MAX_POOLING,0,CONVOLUTION);
c1[1] = convolutional(64,106,106,3,3,192,1,1,0,0,2,2,0,0,2,2,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,MAX_POOLING,1,CONVOLUTION);
// 4 convolutional layers of the first residual layer
c2[0] = convolutional(192,52,52,1,1,128,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,2,CONVOLUTION);
c2[1] = convolutional(128,52,52,3,3,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,3,CONVOLUTION);
c2[2] = convolutional(256,50,50,1,1,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,4,CONVOLUTION);
c2[3] = convolutional(256,50,50,3,3,192,1,1,2,2,1,1,0,0,0,0,NO_NORMALIZATION,LEAKY_RELU,NO_POOLING,5,CONVOLUTION);
// batch normalization (b1[0]) and max pooling 2x2, s = 2 (c5) are applied here, inside the training loop
// 4 convolutional layers of the second residual layer
c3[0] = convolutional(192,26,26,1,1,128,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,1,CONVOLUTION);
c3[1] = convolutional(128,26,26,3,3,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,2,CONVOLUTION);
c3[2] = convolutional(256,24,24,1,1,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,3,CONVOLUTION);
c3[3] = convolutional(256,24,24,3,3,192,1,1,2,2,1,1,0,0,0,0,NO_NORMALIZATION,LEAKY_RELU,NO_POOLING,4,CONVOLUTION);
// batch normalization (b1[1]) and max pooling 2x2, s = 2 (c6) are applied here, inside the training loop
// 4 convolutional layers of the third residual layer
c4[0] = convolutional(192,13,13,1,1,128,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,1,CONVOLUTION);
c4[1] = convolutional(128,13,13,3,3,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,2,CONVOLUTION);
c4[2] = convolutional(256,11,11,1,1,256,1,1,0,0,1,1,0,0,0,0,LOCAL_RESPONSE_NORMALIZATION,LEAKY_RELU,NO_POOLING,3,CONVOLUTION);
c4[3] = convolutional(256,11,11,3,3,192,1,1,2,2,1,1,0,0,0,0,NO_NORMALIZATION,LEAKY_RELU,NO_POOLING,4,CONVOLUTION);
// batch normalization (b1[2]) and max pooling (c7) are applied here, inside the training loop
// 192x6x6 total output at this point
// 2 terminal fully connected layers
f1[0] = fully_connected(192*6*6,4096,1,NO_DROPOUT,LEAKY_RELU,0);
f1[1] = fully_connected(4096,output,2,NO_DROPOUT,LEAKY_RELU,0);
// initialize the 3 residual layers
r1[0] = residual(192,52,52,4,c2);
r2[0] = residual(192,26,26,4,c3);
r3[0] = residual(192,13,13,4,c4);
// use leaky relu as the final activation after each residual layer
r1[0]->cl_output->activation_flag = LEAKY_RELU;
r2[0]->cl_output->activation_flag = LEAKY_RELU;
r3[0]->cl_output->activation_flag = LEAKY_RELU;
// batch normalization and max pooling layers
b1[0] = batch_normalization(batch_size,192*52*52,0,NO_ACTIVATION);
b1[1] = batch_normalization(batch_size,192*26*26,1,NO_ACTIVATION);
b1[2] = batch_normalization(batch_size,192*13*13,2,NO_ACTIVATION);
c5[0] = convolutional(192,52,52,1,1,192,1,1,0,0,2,2,0,0,2,2,NO_NORMALIZATION,NO_ACTIVATION,MAX_POOLING,0,NO_CONVOLUTION);
c6[0] = convolutional(192,26,26,1,1,192,1,1,0,0,2,2,0,0,2,2,NO_NORMALIZATION,NO_ACTIVATION,MAX_POOLING,0,NO_CONVOLUTION);
c7[0] = convolutional(192,13,13,1,1,192,1,1,0,0,2,2,0,0,3,3,NO_NORMALIZATION,NO_ACTIVATION,MAX_POOLING,0,NO_CONVOLUTION);
// create the networks; the temporary networks (e.g. temp_m1) accumulate the sum of the derivatives across the mini-batches
model* m1 = network(6,1,2,0,r1,c1,NULL);
model* temp_m1 = copy_model(m1);
model* m2 = network(5,1,1,0,r2,c5,NULL);
model* temp_m2 = copy_model(m2);
model* m3 = network(5,1,1,0,r3,c6,NULL);
model* temp_m3 = copy_model(m3);
model* m4 = network(3,0,1,2,NULL,c7,f1);
model* temp_m4 = copy_model(m4);
bmodel* bm = batch_network(3,0,0,0,3,NULL,NULL,NULL,b1);
// the mini batch models
model** batch_m1 = (model**)malloc(sizeof(model*)*batch_size);
model** batch_m2 = (model**)malloc(sizeof(model*)*batch_size);
model** batch_m3 = (model**)malloc(sizeof(model*)*batch_size);
model** batch_m4 = (model**)malloc(sizeof(model*)*batch_size);
for(i = 0; i < batch_size; i++){
batch_m1[i] = copy_model(m1);
batch_m2[i] = copy_model(m2);
batch_m3[i] = copy_model(m3);
batch_m4[i] = copy_model(m4);
}
// input vectors after the first, second and third residual layers
float** input_vectors1 = (float**)malloc(sizeof(float*)*batch_size);
float** input_vectors2 = (float**)malloc(sizeof(float*)*batch_size);
float** input_vectors3 = (float**)malloc(sizeof(float*)*batch_size);
for(i = 0; i < batch_size; i++){
input_vectors1[i] = batch_m1[i]->rls[0]->cl_output->post_activation;
input_vectors2[i] = batch_m2[i]->rls[0]->cl_output->post_activation;
input_vectors3[i] = batch_m3[i]->rls[0]->cl_output->post_activation;
}
// At the end of the convolutional model we add a recurrent model to predict each box and object in the image.
// After that we add a last fully connected layer with the output.
// These 2 networks are applied in another function and file.
int threads = 2;
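// the *_multicore variants below run the batch_size model replicas' feed forward / back
// propagation in parallel, using up to `threads` worker threads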
/* Training */
for(i = 0; i < n_instances/batch_size; i++){
// Feed forward
model_tensor_input_ff_multicore(batch_m1,input_channels,input_rows,input_cols,&input[i*batch_size], batch_size,threads);
batch_normalization_feed_forward(batch_size, input_vectors1,b1[0]->temp_vectors, b1[0]->vector_dim, b1[0]->gamma, b1[0]->beta, b1[0]->mean, b1[0]->var, b1[0]->outputs,b1[0]->epsilon);
model_tensor_input_ff_multicore(batch_m2,batch_m2[0]->cls[0]->channels,batch_m2[0]->cls[0]->input_rows,batch_m2[0]->cls[0]->input_cols,b1[0]->outputs, batch_size,threads);
batch_normalization_feed_forward(batch_size, input_vectors2,b1[1]->temp_vectors, b1[1]->vector_dim, b1[1]->gamma, b1[1]->beta, b1[1]->mean, b1[1]->var, b1[1]->outputs,b1[1]->epsilon);
model_tensor_input_ff_multicore(batch_m3,batch_m3[0]->cls[0]->channels,batch_m3[0]->cls[0]->input_rows,batch_m3[0]->cls[0]->input_cols,b1[1]->outputs, batch_size,threads);
batch_normalization_feed_forward(batch_size, input_vectors3,b1[2]->temp_vectors, b1[2]->vector_dim, b1[2]->gamma, b1[2]->beta, b1[2]->mean, b1[2]->var, b1[2]->outputs,b1[2]->epsilon);
model_tensor_input_ff_multicore(batch_m4,batch_m4[0]->cls[0]->channels,batch_m4[0]->cls[0]->input_rows,batch_m4[0]->cls[0]->input_cols,b1[2]->outputs, batch_size,threads);
// Set the errors (left at zero here; in a real run they would hold the loss derivatives w.r.t. the final outputs)
float** error = (float**)malloc(sizeof(float*)*batch_size);
for(j = 0; j < batch_size; j++){
error[j] = (float*)calloc(output,sizeof(float));
}
// BackPropagation
model_tensor_input_bp_multicore(batch_m4,batch_m4[0]->cls[0]->channels,batch_m4[0]->cls[0]->input_rows,batch_m4[0]->cls[0]->input_cols,b1[2]->outputs,batch_size,threads,error,output,error);
batch_normalization_back_prop(batch_size,input_vectors3,b1[2]->temp_vectors, b1[2]->vector_dim, b1[2]->gamma, b1[2]->beta, b1[2]->mean, b1[2]->var,error,b1[2]->d_gamma, b1[2]->d_beta,b1[2]->error2, b1[2]->temp1,b1[2]->temp2, b1[2]->epsilon);
for(j = 0; j < batch_size; j++){
// compute the leaky relu derivative and chain it with the batch-norm input gradient
derivative_leaky_relu_array(batch_m3[j]->rls[0]->cl_output->pre_activation,batch_m3[j]->rls[0]->cl_output->temp3,b1[2]->vector_dim);
dot1D(batch_m3[j]->rls[0]->cl_output->temp3,b1[2]->error2[j],batch_m3[j]->rls[0]->cl_output->temp,b1[2]->vector_dim);
error[j] = batch_m3[j]->rls[0]->cl_output->temp;
}
model_tensor_input_bp_multicore(batch_m3,batch_m3[0]->cls[0]->channels,batch_m3[0]->cls[0]->input_rows,batch_m3[0]->cls[0]->input_cols,b1[1]->outputs,batch_size,threads,error,batch_m3[0]->rls[0]->cl_output->n_kernels*batch_m3[0]->rls[0]->cl_output->rows1*batch_m3[0]->rls[0]->cl_output->cols1,error);
batch_normalization_back_prop(batch_size,input_vectors2,b1[1]->temp_vectors, b1[1]->vector_dim, b1[1]->gamma, b1[1]->beta, b1[1]->mean, b1[1]->var,error,b1[1]->d_gamma, b1[1]->d_beta,b1[1]->error2, b1[1]->temp1,b1[1]->temp2, b1[1]->epsilon);
for(j = 0; j < batch_size; j++){
// compute the leaky relu derivative and chain it with the batch-norm input gradient
derivative_leaky_relu_array(batch_m2[j]->rls[0]->cl_output->pre_activation,batch_m2[j]->rls[0]->cl_output->temp3,b1[1]->vector_dim);
dot1D(batch_m2[j]->rls[0]->cl_output->temp3,b1[1]->error2[j],batch_m2[j]->rls[0]->cl_output->temp,b1[1]->vector_dim);
error[j] = batch_m2[j]->rls[0]->cl_output->temp;
}
model_tensor_input_bp_multicore(batch_m2,batch_m2[0]->cls[0]->channels,batch_m2[0]->cls[0]->input_rows,batch_m2[0]->cls[0]->input_cols,b1[0]->outputs,batch_size,threads,error,batch_m2[0]->rls[0]->cl_output->n_kernels*batch_m2[0]->rls[0]->cl_output->rows1*batch_m2[0]->rls[0]->cl_output->cols1,error);
batch_normalization_back_prop(batch_size,input_vectors1,b1[0]->temp_vectors, b1[0]->vector_dim, b1[0]->gamma, b1[0]->beta, b1[0]->mean, b1[0]->var,error,b1[0]->d_gamma, b1[0]->d_beta,b1[0]->error2, b1[0]->temp1,b1[0]->temp2, b1[0]->epsilon);
for(j = 0; j < batch_size; j++){
derivative_leaky_relu_array(batch_m1[j]->rls[0]->cl_output->pre_activation,batch_m1[j]->rls[0]->cl_output->temp3,b1[0]->vector_dim);
dot1D(batch_m1[j]->rls[0]->cl_output->temp3,b1[0]->error2[j],batch_m1[j]->rls[0]->cl_output->temp,b1[0]->vector_dim);
error[j] = batch_m1[j]->rls[0]->cl_output->temp;
}
model_tensor_input_bp_multicore(batch_m1,input_channels,input_rows,input_cols,&input[i*batch_size],batch_size,threads,error,batch_m1[0]->rls[0]->cl_output->n_kernels*batch_m1[0]->rls[0]->cl_output->rows1*batch_m1[0]->rls[0]->cl_output->cols1,error);
for(j = 0; j < batch_size; j++){
sum_model_partial_derivatives(temp_m1,batch_m1[j],temp_m1);
sum_model_partial_derivatives(temp_m2,batch_m2[j],temp_m2);
sum_model_partial_derivatives(temp_m3,batch_m3[j],temp_m3);
sum_model_partial_derivatives(temp_m4,batch_m4[j],temp_m4);
}
update_bmodel(bm,lr,0.9,batch_size,ADAM,&b1_adam5,&b2_adam5,NO_REGULARIZATION,0,0);
update_model(temp_m1,lr,0.9,batch_size,ADAM,&b1_adam1,&b2_adam1,NO_REGULARIZATION,0,0);
update_model(temp_m2,lr,0.9,batch_size,ADAM,&b1_adam2,&b2_adam2,NO_REGULARIZATION,0,0);
update_model(temp_m3,lr,0.9,batch_size,ADAM,&b1_adam3,&b2_adam3,NO_REGULARIZATION,0,0);
update_model(temp_m4,lr,0.9,batch_size,ADAM,&b1_adam4,&b2_adam4,NO_REGULARIZATION,0,0);
reset_bmodel(bm);
reset_model(temp_m1);
reset_model(temp_m2);
reset_model(temp_m3);
reset_model(temp_m4);
for(j = 0; j < batch_size; j++){
paste_model(temp_m1,batch_m1[j]);
paste_model(temp_m2,batch_m2[j]);
paste_model(temp_m3,batch_m3[j]);
paste_model(temp_m4,batch_m4[j]);
}
}
for(i = 0; i < n_instances; i++){
free(input[i]);
}
free(input);
free_model(temp_m1);
free_model(temp_m2);
free_model(temp_m3);
free_model(temp_m4);
free_model(m1);
free_model(m2);
free_model(m3);
free_model(m4);
free_bmodel(bm);
for(i = 0; i < batch_size; i++){
free_model(batch_m1[i]);
free_model(batch_m2[i]);
free_model(batch_m3[i]);
free_model(batch_m4[i]);
}
free(batch_m1);
free(batch_m2);
free(batch_m3);
free(batch_m4);
free(input_vectors1);
free(input_vectors2);
free(input_vectors3);
// Note: the error buffers allocated inside the training loop are never freed in this example
}