Source code for pyseqlab.crf_learning

'''
@author: Ahmed Allam <ahmed.allam@yale.edu>
'''

import os
from datetime import datetime
import numpy
from .utilities import ReaderWriter, create_directory, generate_datetime_str, vectorized_logsumexp

class Learner(object):
    """learner used for training CRF models supporting search- and gradient-based learning methods

       Args:
           crf_model: an instance of CRF models such as :class:`HOCRFAD`

       Attributes:
           crf_model: an instance of CRF models such as :class:`HOCRFAD`
           training_description: dictionary that will include the training specification
                                 of the model
    """
    def __init__(self, crf_model):
        self.crf_model = crf_model
        self.training_description = None
    def train_model(self, w0, seqs_id, optimization_options, working_dir, save_model=True):
        r"""the **MAIN** method for training models using the various options available

           Args:
               w0: numpy vector representing initial weights for the parameters
               seqs_id: list of integers representing the sequence ids
               optimization_options: dictionary specifying the training method
               working_dir: string representing the directory where the model data
                            and generated files will be saved

           Keyword Arguments:
               save_model: boolean specifying whether to save the final model

           Example:
               The available options for training are:

               - `SGA` for stochastic gradient ascent
               - `SGA-ADADELTA` for stochastic gradient ascent using the ADADELTA approach
               - `BFGS` or `L-BFGS-B` for optimization using second-order information (the Hessian matrix)
               - `SVRG` for the stochastic variance reduced gradient method
               - `COLLINS-PERCEPTRON` for the structured perceptron
               - `SAPO` for the Search-based Probabilistic Online Learning Algorithm (an adapted version)

               For example, possible specifications of the optimization options are:
               ::

                   1) {'method': 'SGA-ADADELTA',
                       'regularization_type': {'l1', 'l2'},
                       'regularization_value': float,
                       'num_epochs': integer,
                       'tolerance': float,
                       'p_rho': float,
                       'epsilon': float
                      }

                   2) {'method': 'SGA' or 'SVRG',
                       'regularization_type': {'l1', 'l2'},
                       'regularization_value': float,
                       'num_epochs': integer,
                       'tolerance': float,
                       'learning_rate_schedule': one of ("bottu", "exponential_decay", "t_inverse", "constant"),
                       't0': float,
                       'a': float
                      }

                   3) {'method': 'L-BFGS-B' or 'BFGS',
                       'regularization_type': 'l2',
                       'regularization_value': float,
                       'disp': False,
                       'maxls': 20,
                       'iprint': -1,
                       'gtol': 1e-05,
                       'eps': 1e-08,
                       'maxiter': 15000,
                       'ftol': 2.220446049250313e-09,
                       'maxcor': 10,
                       'maxfun': 15000
                      }

                   4) {'method': 'COLLINS-PERCEPTRON',
                       'regularization_type': {'l1', 'l2'},
                       'regularization_value': float,
                       'num_epochs': integer,
                       'update_type': {'early', 'max-fast', 'max-exhaustive', 'latest'},
                       'shuffle_seq': boolean,
                       'beam_size': integer,
                       'avg_scheme': {'avg_error', 'avg_uniform'},
                       'tolerance': float
                      }

                   5) {'method': 'SAPO',
                       'regularization_type': {'l2'},
                       'regularization_value': float,
                       'num_epochs': integer,
                       'update_type': 'early',
                       'shuffle_seq': boolean,
                       'beam_size': integer,
                       'topK': integer,
                       'tolerance': float
                      }
        """
        pop_keys = set()
        lambda_type = optimization_options.get("regularization_type")
        pop_keys.add("regularization_type")
        if(lambda_type not in {'l1', 'l2'}):
            # default regularization type is l2
            lambda_type = 'l2'
            # print("regularization by default is l2")
        # get the regularization parameter value
        lambda_val = optimization_options.get("regularization_value")
        pop_keys.add("regularization_value")
        if(lambda_val == None):
            # assign default lambda value
            lambda_val = 0.0
        elif(lambda_val < 0):
            # regularization should be positive
            lambda_val = 0.0
        # initialization of weight vector w
        # w0 = numpy.zeros(len(self.weights))
        method = optimization_options.get("method")
        pop_keys.add("method")
        if(method not in {"L-BFGS-B", "BFGS", "SGA", "SGA-ADADELTA", "SVRG", "COLLINS-PERCEPTRON", "SAPO"}):
            # default weight learning/optimization method
            method = "SGA-ADADELTA"
        if(method in {"L-BFGS-B", "BFGS"}):
            # initialize the new optimization options
            option_keys = set(optimization_options.keys()) - pop_keys
            options = {elmkey: optimization_options[elmkey] for elmkey in option_keys}
            optimization_config = {'method': method,
                                   'regularization_value': lambda_val,
                                   'regularization_type': 'l2',
                                   'options': options}
            estimate_weights = self._optimize_scipy
        elif(method in {"SGA", "SGA-ADADELTA", "SVRG", "COLLINS-PERCEPTRON", "SAPO"}):
            num_epochs = optimization_options.get("num_epochs")
            if(type(num_epochs) != int):
                # default number of epochs if not specified
                num_epochs = 3
            elif(num_epochs < 0):
                # num_epochs should be positive
                num_epochs = 3
            tolerance = optimization_options.get("tolerance")
            if(tolerance == None):
                # default value of tolerance if not specified
                tolerance = 1e-8
            elif(tolerance < 0):
                tolerance = 1e-8
            optimization_config = {'method': method,
                                   'regularization_type': lambda_type,
                                   'regularization_value': lambda_val,
                                   'num_epochs': num_epochs,
                                   'tolerance': tolerance}
            if(method in {"COLLINS-PERCEPTRON", "SAPO"}):
                # for segmentation problems the non-entity symbol is specified using this option; else it is None
                seg_other_symbol = optimization_options.get("seg_other_symbol")
                optimization_config['seg_other_symbol'] = seg_other_symbol
                # setting beam size
                beam_size = optimization_options.get("beam_size")
                # default beam size
                default_beam = len(self.crf_model.model.Y_codebook)
                if(type(beam_size) != int):
                    beam_size = default_beam
                elif(beam_size <= 0 or beam_size > default_beam):
                    beam_size = default_beam
                optimization_config["beam_size"] = beam_size
                self.crf_model.beam_size = beam_size
                # setting update type
                update_type = optimization_options.get("update_type")
                if(update_type not in {'early', 'latest', 'max-exhaustive', 'max-fast'}):
                    update_type = 'early'
                optimization_config["update_type"] = update_type
                # setting shuffle_seq
                shuffle_seq = optimization_options.get("shuffle_seq")
                if(type(shuffle_seq) != bool):
                    shuffle_seq = False
                optimization_config["shuffle_seq"] = shuffle_seq
                if(method == "COLLINS-PERCEPTRON"):
                    # getting averaging scheme
                    avg_scheme = optimization_options.get("avg_scheme")
                    if(avg_scheme not in ("avg_uniform", "avg_error", "survival")):
                        avg_scheme = "avg_error"
                    optimization_config["avg_scheme"] = avg_scheme
                    estimate_weights = self._structured_perceptron
                else:
                    # getting gamma (i.e. the learning rate)
                    gamma = optimization_options.get("gamma")
                    if(gamma == None):
                        # use default value
                        gamma = 1
                    elif(gamma < 0):
                        gamma = 1
                    optimization_config['gamma'] = gamma
                    # getting topK (i.e. the number of top-K decoded sequences)
                    topK = optimization_options.get("topK")
                    if(topK == None):
                        # use default value
                        topK = 5
                    elif(topK < 0):
                        topK = 5
                    optimization_config['topK'] = topK
                    estimate_weights = self._sapo
            elif(method in {"SGA", "SVRG"}):
                # get the other parameters to be tuned such as t0 and a
                learning_rate_schedule = optimization_options.get("learning_rate_schedule")
                if(learning_rate_schedule not in {"bottu", "exponential_decay", "t_inverse", "constant"}):
                    # default learning rate schedule
                    learning_rate_schedule = "t_inverse"
                optimization_config["learning_rate_schedule"] = learning_rate_schedule
                t0 = optimization_options.get("t0")
                if(t0 == None):
                    # use default value
                    t0 = 0.1
                elif(t0 < 0):
                    t0 = 0.1
                optimization_config['t0'] = t0
                if(learning_rate_schedule in {"t_inverse", "exponential_decay"}):
                    # get the a parameter
                    a = optimization_options.get("a")
                    if(a == None):
                        # use a default value
                        a = 0.9
                    elif(a <= 0 or a >= 1):
                        a = 0.9
                    optimization_config['a'] = a
                if(method == "SGA"):
                    estimate_weights = self._sga_classic
                else:
                    estimate_weights = self._sga_svrg
            elif(method == "SGA-ADADELTA"):
                estimate_weights = self._sga_adadelta
                p_rho = optimization_options.get("p_rho")
                if(p_rho == None):
                    # default value
                    p_rho = 0.95
                elif(p_rho < 0):
                    # p_rho should be positive
                    p_rho = 0.95
                epsilon = optimization_options.get("epsilon")
                if(epsilon == None):
                    # default value of epsilon if not specified
                    epsilon = 1e-6
                elif(epsilon < 0):
                    epsilon = 1e-6
                optimization_config['p_rho'] = p_rho
                optimization_config['epsilon'] = epsilon

        # save the training options
        self.training_description = optimization_config
        model_foldername = generate_datetime_str()
        model_dir = create_directory(model_foldername, create_directory("models", working_dir))
        model_name = model_foldername + ".model"
        self.training_description["model_dir"] = model_dir
        self.training_description["model_name"] = model_name
        self.training_description["train_seqs_id"] = seqs_id
        # if everything is defined correctly then estimate the parameters
        w_hat = estimate_weights(w0, seqs_id)
        # update model weights to w_hat
        self.crf_model.weights = w_hat
        if(save_model):
            # pickle the model
            modelparts_dir = create_directory("model_parts", model_dir)
            self.crf_model.save_model(modelparts_dir)
        # cleanup the instance variables
        self.cleanup()
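    # Example usage (a minimal sketch, not part of the library API): `crf_model`,
    # `seqs_id` and `working_dir` are assumed to be prepared by the caller, and the
    # number of parameters is assumed available via len(crf_model.weights), as hinted
    # by the commented-out initialization in train_model above.
    #
    #   import numpy
    #   learner = Learner(crf_model)                      # crf_model: e.g. an HOCRFAD instance
    #   w0 = numpy.zeros(len(crf_model.weights))          # initial weights
    #   options = {'method': 'SGA-ADADELTA',
    #              'regularization_type': 'l2',
    #              'regularization_value': 0.01,
    #              'num_epochs': 5,
    #              'tolerance': 1e-8,
    #              'p_rho': 0.95,
    #              'epsilon': 1e-6}
    #   learner.train_model(w0, seqs_id, options, working_dir, save_model=True)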
    def _report_training(self):
        """report training by logging the description to a file"""
        method = self.training_description["method"]
        regularization_type = self.training_description["regularization_type"]
        # regularization parameter lambda
        C = self.training_description['regularization_value']
        model_dir = self.training_description["model_dir"]
        model_name = self.training_description["model_name"]
        # log file
        log_file = os.path.join(model_dir, "crf_training_log.txt")
        line = "---Model training--- starting time {} \n".format(datetime.now())
        line += "model name: {} \n".format(model_name)
        line += "model directory: {} \n".format(model_dir)
        line += "model type: {} \n".format(self.crf_model.__class__)
        line += "training method: {} \n".format(method)
        if(C):
            line += "type of regularization: {} \n".format(regularization_type)
            line += "value of regularization: {} \n".format(C)
        if(method == "SGA"):
            learning_rate_schedule = self.training_description["learning_rate_schedule"]
            t0 = self.training_description["t0"]
            line += "learning rate schedule: {} \n".format(learning_rate_schedule)
            line += "t0: {} \n".format(t0)
            if(learning_rate_schedule in ("t_inverse", "exponential_decay")):
                # get the a parameter
                a = self.training_description["a"]
                line += "a: {} \n".format(a)
        elif(method == "SGA-ADADELTA"):
            rho = self.training_description["p_rho"]
            epsilon = self.training_description["epsilon"]
            line += "p_rho: {} \n".format(rho)
            line += "epsilon: {} \n".format(epsilon)
        elif(method in {"SAPO", "COLLINS-PERCEPTRON"}):
            update_type = self.training_description['update_type']
            beam_size = self.training_description['beam_size']
            shuffle_seq = self.training_description['shuffle_seq']
            line += "update_type: {} \n".format(update_type)
            line += "beam_size: {} \n".format(beam_size)
            line += "shuffle_seq: {} \n".format(shuffle_seq)
            if(method == "COLLINS-PERCEPTRON"):
                avg_scheme = self.training_description["avg_scheme"]
                line += "averaging scheme: {} \n".format(avg_scheme)
            else:
                gamma = self.training_description['gamma']
                topK = self.training_description['topK']
                line += "gamma (learning rate): {} \n".format(gamma)
                line += "topK (number of top decoded seqs): {} \n".format(topK)
        if(method not in ("L-BFGS-B", "BFGS")):
            line += "number of epochs: {} \n".format(self.training_description['num_epochs'])
        # write to file
        ReaderWriter.log_progress(line, log_file)

    def _check_reldiff(self, x, y):
        """calculate the relative difference between two numbers

           Args:
               x: float
               y: float
        """
        tolerance = self.training_description["tolerance"]
        if(numpy.abs(y) <= tolerance):
            self._exitloop = True
        else:
            if(x != y):
                reldiff = numpy.abs(x - y) / (numpy.abs(x) + numpy.abs(y))
                # print("reldiff = {}".format(reldiff))
                if(reldiff <= tolerance):
                    self._exitloop = True
                else:
                    self._exitloop = False

    def _optscipy_seqs_loglikelihood(self, w, seqs_id):
        """compute the sequences' loglikelihood when using the BFGS and L-BFGS-B optimization options

           Args:
               w: weight vector (numpy vector)
               seqs_id: list of integers representing ids assigned to the sequences
        """
        crf_model = self.crf_model
        seqs_loglikelihood = crf_model.compute_seqs_loglikelihood(w, seqs_id)
        # clear cached info
        crf_model.clear_cached_info(seqs_id)
        # check for the regularization parameter
        l2 = self.training_description["regularization_value"]
        if(l2 > 0):
            # log(p(Y|X;w)) - lambda/2 * ||w||**2
            seqs_loglikelihood = seqs_loglikelihood - ((l2/2) * numpy.dot(w, w))
        # since the optimization is based on minimization, we multiply by -1
        seqs_loglikelihood = seqs_loglikelihood * -1
        return(seqs_loglikelihood)

    def _optscipy_seqs_gradient(self, w, seqs_id):
        """compute the sequences' gradient when using the BFGS and L-BFGS-B optimization options

           Args:
               w: weight vector (numpy vector)
               seqs_id: list of integers representing ids assigned to the sequences
        """
        crf_model = self.crf_model
        seqs_grad = crf_model.compute_seqs_gradient(w, seqs_id)
        # clear cached info
        crf_model.clear_cached_info(seqs_id)
        l2 = self.training_description["regularization_value"]
        if(l2 > 0):
            seqs_grad = seqs_grad - (l2 * w)
        # since the optimization is based on minimization, we multiply by -1
        seqs_grad = seqs_grad * -1
        return(seqs_grad)

    def _optimize_scipy(self, w, train_seqs_id):
        """estimate the parameters w of the model using the scipy optimize module

           it uses the `optimize.minimize()` function from the scipy package

           Args:
               w: weight vector (numpy vector)
               train_seqs_id: list of integers representing ids of the training sequences
        """
        from scipy import optimize
        self._report_training()
        objfunc = self._optscipy_seqs_loglikelihood
        gradfunc = self._optscipy_seqs_gradient
        method = self.training_description["method"]
        options = self.training_description['options']
        # to keep track of elapsed time between optimization iterations
        self._elapsed_time = datetime.now()
        self._iter_count = 0
        # note: args must be a tuple -- (train_seqs_id,) not (train_seqs_id),
        # otherwise the list would be unpacked into separate positional arguments
        result = optimize.minimize(fun=objfunc,
                                   x0=w,
                                   args=(train_seqs_id,),
                                   method=method,
                                   jac=gradfunc,
                                   options=options,
                                   callback=self._track_scipy_optimizer)
        model_dir = self.training_description["model_dir"]
        # log file
        log_file = os.path.join(model_dir, "crf_training_log.txt")
        line = "---Model training--- end time {} \n".format(datetime.now())
        line += "\n \n"
        ReaderWriter.log_progress(line, log_file)
        # print("results \n {}".format(result))
        print("success: ", result['success'])
        # estimated optimal weights
        w_hat = result.x
        return(w_hat)

    def _track_scipy_optimizer(self, w):
        """track scipy optimization by logging each iteration

           Args:
               w: weight vector (numpy vector)
        """
        # increment iteration count
        self._iter_count += 1
        delta_time = datetime.now() - self._elapsed_time
        crf_model = self.crf_model
        # approximate estimation of the sum of loglikelihoods -- using the previous weights
        train_seqs_id = self.training_description["train_seqs_id"]
        seqs_loglikelihood = 0
        for seq_id in train_seqs_id:
            seq_loglikelihood = crf_model.seqs_info[seq_id]["loglikelihood"]
            seqs_loglikelihood += seq_loglikelihood
        seqs_loglikelihood *= -1
        # To compute the sum of the sequences' loglikelihood using the updated/current
        # weights, use the command below. The sum should be decreasing after each
        # iteration for successful training (useful as a diagnostic), but it is
        # expensive/costly to recompute:
        #     seqs_loglikelihood = crf_model.compute_seqs_loglikelihood(w, train_seqs_id)
        model_dir = self.training_description["model_dir"]
        log_file = os.path.join(model_dir, "crf_training_log.txt")
        line = "--- Iteration {} --- \n".format(self._iter_count)
        line += "Estimated average negative loglikelihood is {} \n".format(seqs_loglikelihood)
        line += "Number of seconds spent: {} \n".format(delta_time.total_seconds())
        ReaderWriter.log_progress(line, log_file)
        self._elapsed_time = datetime.now()
        print("iteration ", self._iter_count)

    def _identify_violation_indx(self, viol_indx, y_ref_boundaries):
        """determine the index where the violation occurs

           a violation occurs when the reference state falls off the specified beam while decoding

           Args:
               viol_indx: index (1-based) where a violation occurred while decoding
               y_ref_boundaries: boundaries of the labels/tags in the reference sequence
        """
        counter = 0
        for boundary in y_ref_boundaries:
            __, v = boundary
            if(v >= viol_indx):
                viol_pos = v
                viol_boundindex = counter + 1
                break
            counter += 1
        return(viol_pos, viol_boundindex)
    def _compute_seq_decerror(self, y_ref, y_imposter, viol_pos):
        """compute the decoding error of a sequence

           Args:
               y_ref: reference sequence list of labels
               y_imposter: imposter/decoded sequence list of labels
               viol_pos: index where the violation occurred; it is identified using
                         the :func:`_identify_violation_indx` function
        """
        # print("y_ref ", y_ref)
        # print("y_imposter ", y_imposter)
        # print("viol_pos ", viol_pos)
        T = len(y_ref[:viol_pos])
        mismatch = [i for i in range(T) if y_ref[i] != y_imposter[i]]
        len_diff = len(mismatch)
        # the error range is [0, 1]
        seq_err_count = float(len_diff/T)
        return(seq_err_count)

    def _unpack_windxfval(self, y_windxfval):
        """unpack the weight indices and corresponding feature values

           Args:
               y_windxfval: tuple of two numpy arrays; the first represents the weight
                            indices of the features while the second represents the
                            feature sum/count values
        """
        windx, fval = y_windxfval
        return(windx, fval)

    def _find_update_violation(self, w, seq_id):
        """determine the *best* imposter sequence for weight updates

           Args:
               w: weight vector (numpy vector)
               seq_id: integer representing the unique id assigned to the sequence
        """
        method = self.training_description['method']
        beam_size = self.training_description['beam_size']
        update_type = self.training_description['update_type']
        topK = self.training_description.get('topK')
        crf_model = self.crf_model
        seqs_info = crf_model.seqs_info
        l = {'Y': (seq_id, )}
        crf_model.check_cached_info(seq_id, l)
        y_ref = seqs_info[seq_id]['Y']['flat_y']
        y_ref_boundaries = seqs_info[seq_id]['Y']['boundaries']
        if(update_type in {'max-fast', 'max-exhaustive', 'latest'}):
            early_stop = False
        else:
            early_stop = True
        if(not topK):
            y_imposter, viol_indx = crf_model.viterbi(w, seq_id, beam_size, early_stop, y_ref)
            y_imposters = [y_imposter]
        else:
            y_imposters, viol_indx = crf_model.viterbi(w, seq_id, beam_size, early_stop, y_ref, topK)
        seq_err_count = None
        ref_unp_windxfval = None
        imps_unp_windxfval = None
        # top decoded sequence
        y_imposter = y_imposters[0]
        if(not viol_indx):
            # we can perform a full update
            print("in full update routine ...")
            T = seqs_info[seq_id]['T']
            seq_err_count = self._compute_seq_decerror(y_ref, y_imposter, T)
            if(seq_err_count or method == "SAPO"):
                ref_unp_windxfval, imps_unp_windxfval = self._load_gfeatures(seq_id, "globalfeatures", y_imposters, T, len(y_ref_boundaries))
        else:
            if(update_type == "early"):
                print("in early update routine ...")
                # viol_indx is 1-based indexing
                earlyviol_indx = viol_indx[0]
                viol_pos, viol_boundindex = self._identify_violation_indx(earlyviol_indx, y_ref_boundaries)
                seq_err_count = self._compute_seq_decerror(y_ref, y_imposter, viol_pos)
                ref_unp_windxfval, imps_unp_windxfval = self._load_gfeatures(seq_id, "globalfeatures_per_boundary", y_imposters, viol_pos, viol_boundindex)
            elif(update_type == "max-exhaustive"):
                # max update is only supported for one imposter sequence
                max_diff = numpy.inf
                L = crf_model.model.L
                print("in max-exhaustive update routine ...")
                test = []
                # viol_indx is 1-based indexing
                for i in range(len(viol_indx)):
                    indx = viol_indx[i]
                    if(i == 0):
                        # case of the early update index
                        if(L > 1):
                            viol_pos, viol_boundindex = self._identify_violation_indx(indx, y_ref_boundaries)
                        else:
                            viol_pos = indx
                            viol_boundindex = viol_pos
                        seq_err_count = self._compute_seq_decerror(y_ref, y_imposter, viol_pos)
                    else:
                        if(L > 1):
                            __, v = y_ref_boundaries[viol_boundindex]
                            viol_pos = v
                            viol_boundindex += 1
                        else:
                            viol_pos = indx
                            viol_boundindex = viol_pos
                        # seq_err_count = self._compute_seq_decerror(y_ref, y_imposter, viol_pos)
                    ref_unp_windxfval, imps_unp_windxfval = self._load_gfeatures(seq_id, "globalfeatures_per_boundary", y_imposters, viol_pos, viol_boundindex)
                    ref_windx, ref_fval = ref_unp_windxfval
                    imp_windx, imp_fval = imps_unp_windxfval[0]
                    diff = numpy.dot(w[ref_windx], ref_fval) - numpy.dot(w[imp_windx], imp_fval)
                    test.append(diff)
                    # print("diff = {}, max_diff = {} ".format(diff, max_diff))
                    if(diff <= max_diff):
                        # using less than or equal allows getting the longest sequence having the max difference
                        max_diff = diff
                        ref_unp_windxfval = (ref_windx, ref_fval)
                        imp_unp_windxfval = (imp_windx, imp_fval)
                imps_unp_windxfval = [imp_unp_windxfval]
                # print("test ", test)
            elif(update_type == "max-fast"):
                # based on empirical observation, the last violation index (i.e. where the beam
                # falls off) almost always yields the max violation
                # this is a heuristic; for an exhaustive procedure, choose `max-exhaustive`
                # max update is only supported for one imposter sequence
                L = crf_model.model.L
                print("in max-fast update routine ...")
                # viol_indx is 1-based indexing
                lastviol_indx = viol_indx[-1]
                viol_pos, viol_boundindex = self._identify_violation_indx(lastviol_indx, y_ref_boundaries)
                seq_err_count = self._compute_seq_decerror(y_ref, y_imposter, viol_pos)
                ref_unp_windxfval, imps_unp_windxfval = self._load_gfeatures(seq_id, "globalfeatures_per_boundary", y_imposters, viol_pos, viol_boundindex)
            elif(update_type == 'latest'):
                # the 'latest' update type is yet to be implemented
                pass
        return(ref_unp_windxfval, imps_unp_windxfval, seq_err_count)
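    # Illustrative example (toy values) of the decoding error computed by
    # _compute_seq_decerror above: with y_ref = ['A', 'B', 'C'],
    # y_imposter = ['A', 'C', 'C'] and viol_pos = 3, the labels disagree at one of
    # the first three positions, so the per-sequence error is 1/3.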
    def _load_gfeatures(self, seq_id, gfeatures_type, y_imposters, ypos_indx, boundpos_indx):
        """load the global features of the reference and imposter/decoded sequences

           Args:
               seq_id: id of the sequence
               gfeatures_type: determines the representation, either aggregated or by boundary
               y_imposters: list of imposter sequences
               ypos_indx: index of the considered end of the label sequence
               boundpos_indx: index of the boundary corresponding to the identified `ypos_indx`
        """
        seg_other_symbol = self.training_description['seg_other_symbol']
        crf_model = self.crf_model
        seqs_info = crf_model.seqs_info
        y_ref_boundaries = seqs_info[seq_id]['Y']['boundaries']
        if(gfeatures_type == "globalfeatures"):
            per_boundary = False
            y_ref_boundaries = None
        else:
            per_boundary = True
            # to assign y_ref_boundaries here -> y_ref_boundaries = y_ref_boundaries[:boundpos_indx]
        l = {gfeatures_type: (seq_id, per_boundary)}
        crf_model.check_cached_info(seq_id, l)
        ref_gfeatures = seqs_info[seq_id][gfeatures_type]
        if(y_ref_boundaries):
            y_ref_windxfval = crf_model.represent_globalfeature(ref_gfeatures, y_ref_boundaries[:boundpos_indx])
        else:
            y_ref_windxfval = seqs_info[seq_id][gfeatures_type]
        # ref_unp_windxfval = self._unpack_windxfval(y_ref_windxfval)
        # generate global features for the imposters
        imposters_windxfval = []
        for y_imposter in y_imposters:
            # generate global features for the current imposter
            imposter_gfeatures_perboundary, y_imposter_boundaries = crf_model.load_imposter_globalfeatures(seq_id, y_imposter[:ypos_indx], seg_other_symbol)
            # print("imposter_gfeatures_perboundary ", imposter_gfeatures_perboundary)
            # print("imposter y_boundaries ", y_imposter_boundaries)
            y_imposter_windxfval = crf_model.represent_globalfeature(imposter_gfeatures_perboundary, y_imposter_boundaries)
            imposters_windxfval.append(y_imposter_windxfval)
        return(y_ref_windxfval, imposters_windxfval)
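    # The imposter probabilities used by SAPO (see _compute_probvec_sapo below) form a
    # softmax over the imposters' scores. For example (toy values), scores
    # ll_vec = [2, 1, 0] give prob_vec ~= [0.665, 0.245, 0.090] via
    # exp(ll_vec - logsumexp(ll_vec)).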
    def _update_weights_sapo(self, w, ref_unp_windxfval, imps_unp_windxfval, prob_vec):
        """update the weight vector for the SAPO method

           Args:
               w: weight vector (numpy vector)
               ref_unp_windxfval: tuple of two numpy array elements representing the weight
                                  indices and corresponding feature sum/count of the
                                  reference sequence
               imps_unp_windxfval: list of tuples, each comprising two numpy array elements
                                   representing the weight indices and corresponding feature
                                   sum/count of the imposter sequences
               prob_vec: numpy vector representing the probability of each imposter sequence
        """
        gamma = self.training_description['gamma']
        # update weights using the decoded sequences
        for i in range(len(imps_unp_windxfval)):
            windx, fval = imps_unp_windxfval[i]
            w[windx] -= (gamma*prob_vec[i]) * fval
        # update weights using the reference sequence
        windx, fval = ref_unp_windxfval
        w[windx] += gamma * fval

    def _compute_probvec_sapo(self, w, imps_unp_windxfval):
        """compute the probability of each imposter sequence in the SAPO algorithm

           Args:
               w: weight vector (numpy vector)
               imps_unp_windxfval: list of tuples (unpacked) representing the weight indices
                                   and corresponding feature sum/count of the imposter sequences
        """
        # normalize
        num_imposters = len(imps_unp_windxfval)
        ll_vec = numpy.zeros(num_imposters)
        for i in range(num_imposters):
            windx, fval = imps_unp_windxfval[i]
            ll_vec[i] = numpy.dot(w[windx], fval)
        Z = vectorized_logsumexp(ll_vec)
        prob_vec = numpy.exp(ll_vec - Z)
        # print("prob_vec ", prob_vec)
        return(prob_vec)

    def _sapo(self, w, train_seqs_id):
        """implements the Search-based Probabilistic Online Learning Algorithm (SAPO)

           this implementation adapts SAPO to the 'violation-fixing' framework
           (i.e. inexact search is supported)

           .. seealso::
              the original paper at https://arxiv.org/pdf/1503.08381v1.pdf

           .. note::
              the regularization is based on averaging rather than l2, as averaging seems
              to be consistent during training while using exact or inexact search
        """
        self._report_training()
        num_epochs = self.training_description["num_epochs"]
        # regularization_type = self.training_description["regularization_type"]
        # regularization parameter lambda
        # C = self.training_description['regularization_value']
        # gamma = self.training_description['gamma']
        shuffle_seq = self.training_description['shuffle_seq']
        model_dir = self.training_description["model_dir"]
        log_file = os.path.join(model_dir, "crf_training_log.txt")
        N = len(train_seqs_id)
        crf_model = self.crf_model
        # instance variable to keep track of elapsed time between optimization iterations
        self._elapsed_time = datetime.now()
        self._exitloop = False
        avg_error_list = [0]
        w_avg = numpy.zeros(len(w), dtype='longdouble')
        for k in range(num_epochs):
            seq_left = N
            error_count = 0
            if(shuffle_seq):
                numpy.random.shuffle(train_seqs_id)
            for seq_id in train_seqs_id:
                ref_unp_windxfval, imps_unp_windxfval, seq_err_count = self._find_update_violation(w, seq_id)
                prob_vec = self._compute_probvec_sapo(w, imps_unp_windxfval)
                self._update_weights_sapo(w, ref_unp_windxfval, imps_unp_windxfval, prob_vec)
                # regularize the weights
                # reg = -(C/N) * w
                # w += gamma*reg
                w_avg += w
                crf_model.clear_cached_info([seq_id])
                seq_left -= 1
                # print('seq_err_count ', seq_err_count)
                if(seq_err_count):
                    error_count += seq_err_count
                # print("error count {}".format(error_count))
                print("sequences left {}".format(seq_left))
            avg_error_list.append(float(error_count/N))
            self._track_perceptron_optimizer(w, k, avg_error_list)
            ReaderWriter.dump_data(w_avg/((k+1)*N), os.path.join(model_dir, "model_avgweights_epoch_{}".format(k+1)))
            print("average error : {}".format(avg_error_list[1:]))
            # print("self._exitloop {}".format(self._exitloop))
            if(self._exitloop):
                break
            self._elapsed_time = datetime.now()
        line = "---Model training--- end time {} \n".format(datetime.now())
        ReaderWriter.log_progress(line, log_file)
        w = w_avg/(num_epochs*N)
        ReaderWriter.dump_data(avg_error_list, os.path.join(model_dir, 'avg_decodingerror_training'))
        return(w)

    def _update_weights_perceptron(self, w, ref_unp_windxfval, imp_unp_windxfval):
        """update the weight vector for the COLLINS-PERCEPTRON method

           Args:
               w: weight vector (numpy vector)
               ref_unp_windxfval: tuple (unpacked) representing the weight indices and
                                  corresponding feature sum/count of the reference sequence
               imp_unp_windxfval: tuple (unpacked) representing the weight indices and
                                  corresponding feature sum/count of the imposter sequence
        """
        ref_windx, ref_fval = ref_unp_windxfval
        imp_windx, imp_fval = imp_unp_windxfval
        w[ref_windx] += ref_fval
        w[imp_windx] -= imp_fval

    def _structured_perceptron(self, w, train_seqs_id):
        """implements the structured perceptron algorithm, in particular the averaged perceptron

           it was introduced by Michael Collins in 2002
           (see his paper http://www.aclweb.org/anthology/W02-1001)
           this implementation supports different averaging schemes for the weight learning

           Args:
               w: weight vector (numpy vector)
               train_seqs_id: list of integers representing ids assigned to the sequences
        """
        self._report_training()
        num_epochs = self.training_description["num_epochs"]
        avg_scheme = self.training_description["avg_scheme"]
        shuffle_seq = self.training_description["shuffle_seq"]
        model_dir = self.training_description["model_dir"]
        log_file = os.path.join(model_dir, "crf_training_log.txt")
        N = len(train_seqs_id)
        crf_model = self.crf_model
        # instance variable to keep track of elapsed time between optimization iterations
        self._elapsed_time = datetime.now()
        self._exitloop = False
        if(avg_scheme in {"avg_error", "avg_uniform"}):
            # accumulated sum of estimated weights
            w_avg = numpy.zeros(len(w), dtype="longdouble")
            avg_error_list = [0]
            num_upd = 0
            for k in range(num_epochs):
                seq_left = N
                error_count = 0
                if(shuffle_seq):
                    numpy.random.shuffle(train_seqs_id)
                for seq_id in train_seqs_id:
                    print("sequences left {}".format(seq_left))
                    ref_unp_windxfval, imps_unp_windxfval, seq_err_count = self._find_update_violation(w, seq_id)
                    # check if decoding errors occurred with the current weights
                    # print("seq_err_count ", seq_err_count)
                    if(seq_err_count):
                        error_count += seq_err_count
                        if(avg_scheme == "avg_error"):
                            # emphasize previous weights that have a small average decoding error per sequence
                            w_avg += (1-seq_err_count) * w
                            num_upd += (1-seq_err_count)
                        else:
                            w_avg += w
                            num_upd += 1
                        # update the current weights
                        self._update_weights_perceptron(w, ref_unp_windxfval, imps_unp_windxfval[0])
                    crf_model.clear_cached_info([seq_id])
                    seq_left -= 1
                    # print("error count {}".format(error_count))
                avg_error_list.append(float(error_count/N))
                self._track_perceptron_optimizer(w, k, avg_error_list)
                if(num_upd):
                    w_dump = w_avg/num_upd
                else:
                    w_dump = w_avg
                ReaderWriter.dump_data(w_dump, os.path.join(model_dir, "model_avgweights_epoch_{}".format(k+1)))
                print("average error : {}".format(avg_error_list[1:]))
                # print("self._exitloop {}".format(self._exitloop))
                if(self._exitloop):
                    break
                self._elapsed_time = datetime.now()
            if(num_upd):
                w = w_avg/num_upd
        line = "---Model training--- end time {} \n".format(datetime.now())
        ReaderWriter.log_progress(line, log_file)
        ReaderWriter.dump_data(avg_error_list, os.path.join(model_dir, 'avg_decodingerror_training'))
        return(w)
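    # A minimal sketch of the sparse perceptron update performed above (toy numbers,
    # not tied to any pyseqlab data structures):
    #
    #   import numpy
    #   w = numpy.zeros(6)
    #   ref_windx, ref_fval = numpy.array([0, 3]), numpy.array([2., 1.])  # reference features
    #   imp_windx, imp_fval = numpy.array([3, 5]), numpy.array([1., 2.])  # imposter features
    #   w[ref_windx] += ref_fval   # reward features fired by the reference sequence
    #   w[imp_windx] -= imp_fval   # penalize features fired by the imposter sequence
    #   # w is now [2., 0., 0., 0., 0., -2.]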
    def _track_perceptron_optimizer(self, w, k, avg_error_list):
        """track search-based optimizers (such as SAPO and COLLINS-PERCEPTRON) by logging each iteration

           Args:
               w: weight vector (numpy vector)
               k: current epoch
               avg_error_list: list of the decoding errors in each of the previous epochs
        """
        delta_time = datetime.now() - self._elapsed_time
        self._check_reldiff(avg_error_list[-2], avg_error_list[-1])
        model_dir = self.training_description["model_dir"]
        log_file = os.path.join(model_dir, "crf_training_log.txt")
        line = "--- Iteration {} --- \n".format(k+1)
        line += "Average percentage of decoding error: {} \n".format(avg_error_list[-1]*100)
        line += "Number of seconds spent: {} \n".format(delta_time.total_seconds())
        ReaderWriter.log_progress(line, log_file)
        # dump the learned weights for every pass
        ReaderWriter.dump_data(w, os.path.join(model_dir, "model_weights_epoch_{}".format(k+1)))

    def _sga_adadelta(self, w, train_seqs_id):
        """implements stochastic gradient ascent using the adaptive approach of ADADELTA

           the original paper is found at https://arxiv.org/abs/1212.5701

           Args:
               w: weight vector (numpy vector)
               train_seqs_id: list of integers representing ids assigned to the sequences
        """
        self._report_training()
        crf_model = self.crf_model
        num_epochs = self.training_description["num_epochs"]
        regularization_type = self.training_description["regularization_type"]
        # regularization parameter lambda
        C = self.training_description['regularization_value']
        # number of training sequences
        N = len(train_seqs_id)
        model_dir = self.training_description["model_dir"]
        log_file = os.path.join(model_dir, "crf_training_log.txt")
        # keeps track of the log-likelihood of a sequence before weight updating
        seqs_loglikelihood_vec = numpy.zeros(N)
        seqs_id_mapper = {seq_id: unique_id for unique_id, seq_id in enumerate(train_seqs_id)}
        # step size decides the number of data points to average in the seqs_loglikelihood_vec
        # using 10% of the data points
        step_size = round(N * 0.1)
        if step_size == 0:
            step_size = 1
        mean_cost_vec = [0]
        p_rho = self.training_description["p_rho"]
        epsilon = self.training_description["epsilon"]
        E_g2 = numpy.zeros(len(w), dtype="longdouble")
        E_deltaw2 = numpy.zeros(len(w), dtype="longdouble")
        if(regularization_type == "l1"):
            u = 0
            q = numpy.zeros(len(w), dtype="longdouble")
        # gradient
        grad = numpy.zeros(len(w), dtype="longdouble")
        # instance variable to keep track of elapsed time between optimization iterations
        self._elapsed_time = datetime.now()
        self._exitloop = False
        for k in range(num_epochs):
            # shuffle sequences at the beginning of each epoch
            numpy.random.shuffle(train_seqs_id)
            numseqs_left = N
            print("k ", k)
            for seq_id in train_seqs_id:
                seq_loglikelihood = crf_model.compute_seq_loglikelihood(w, seq_id)
                seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood
                target_indx = crf_model.compute_seq_gradient(w, seq_id, grad)
                if(C):
                    if(regularization_type == 'l2'):
                        seq_loglikelihood += - ((C/N) * (1/2) * numpy.dot(w, w))
                        grad -= ((C/N) * w)
                    elif(regularization_type == 'l1'):
                        seq_loglikelihood += - (C/N) * numpy.sum(numpy.abs(w))
                    # update the computed sequence loglikelihood by adding the regularization term contribution
                    seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood
                    # accumulate gradient
                    E_g2 = p_rho * E_g2 + (1-p_rho) * numpy.square(grad)
                    RMS_g = numpy.sqrt(E_g2 + epsilon)
                    RMS_deltaw = numpy.sqrt(E_deltaw2 + epsilon)
                    ratio = (RMS_deltaw/RMS_g)
                    deltaw = ratio * grad
                    E_deltaw2 = p_rho * E_deltaw2 + (1-p_rho) * numpy.square(deltaw)
                    w += deltaw
                    if(regularization_type == "l1"):
                        u += ratio * (C/N)
                        w_upd, q_upd = self._apply_l1_penalty(w, q, u, target_indx)
                        w = w_upd
                        q = q_upd
                else:
                    # accumulate gradient (sparse updates on the fired features only)
                    fval = grad[target_indx]
                    E_g2 = p_rho * E_g2
                    E_g2[target_indx] += (1-p_rho) * numpy.square(fval)
                    RMS_g = numpy.sqrt(E_g2 + epsilon)
                    RMS_deltaw = numpy.sqrt(E_deltaw2 + epsilon)
                    ratio = (RMS_deltaw/RMS_g)
                    deltaw = ratio[target_indx] * fval
                    E_deltaw2 = p_rho * E_deltaw2
                    E_deltaw2[target_indx] += (1-p_rho) * numpy.square(deltaw)
                    w[target_indx] += deltaw
                # clean cached info
                crf_model.clear_cached_info([seq_id])
                numseqs_left -= 1
                # reset the gradient
                grad.fill(0)
                print("num seqs left: {}".format(numseqs_left))
            seqs_cost_vec = [numpy.mean(seqs_loglikelihood_vec[i:i+step_size]) for i in range(0, N, step_size)]
            # consider plotting this vector
            mean_cost_vec.append(numpy.mean(seqs_loglikelihood_vec))
            self._track_sga_optimizer(w, seqs_cost_vec, mean_cost_vec, k)
            if(self._exitloop):
                break
            self._elapsed_time = datetime.now()
        line = "---Model training--- end time {} \n".format(datetime.now())
        ReaderWriter.log_progress(line, log_file)
        ReaderWriter.dump_data(mean_cost_vec, os.path.join(model_dir, 'avg_loglikelihood_training'))
        return(w)
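    # The ADADELTA recursions implemented above, in compact form (rho = p_rho,
    # written for gradient *ascent*, so delta_w is added to w):
    #
    #   E[g^2]_t  = rho * E[g^2]_{t-1}  + (1 - rho) * g_t^2
    #   delta_w_t = (RMS[dw]_{t-1} / RMS[g]_t) * g_t,   where RMS[x] = sqrt(E[x^2] + epsilon)
    #   E[dw^2]_t = rho * E[dw^2]_{t-1} + (1 - rho) * delta_w_t^2
    #   w_{t+1}   = w_t + delta_w_t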
    def _sga_classic(self, w, train_seqs_id):
        """implements stochastic gradient ascent

           Args:
               w: weight vector (numpy vector)
               train_seqs_id: list of integers representing ids assigned to the sequences
        """
        self._report_training()
        crf_model = self.crf_model
        num_epochs = self.training_description["num_epochs"]
        regularization_type = self.training_description["regularization_type"]
        # regularization parameter lambda
        C = self.training_description['regularization_value']
        # number of training sequences
        N = len(train_seqs_id)
        model_dir = self.training_description["model_dir"]
        log_file = os.path.join(model_dir, "crf_training_log.txt")
        # keeps track of the log-likelihood of a sequence before weight updating
        seqs_loglikelihood_vec = numpy.zeros(N)
        seqs_id_mapper = {seq_id: unique_id for unique_id, seq_id in enumerate(train_seqs_id)}
        # step size decides the number of data points to average in the seqs_loglikelihood_vec
        # using 10% of the data points
        step_size = round(N * 0.1)
        if step_size == 0:
            step_size = 1
        mean_cost_vec = [0]
        # instance variable to keep track of elapsed time between optimization iterations
        self._elapsed_time = datetime.now()
        self._exitloop = False
        if(regularization_type == "l1"):
            u = 0
            q = numpy.zeros(len(w), dtype="longdouble")
        learning_rate_schedule = self.training_description["learning_rate_schedule"]
        t0 = self.training_description["t0"]
        # the a parameter should be strictly between 0 and 1 (0 < a < 1)
        a = self.training_description["a"]
        t = 0
        # gradient
        grad = numpy.zeros(len(w), dtype="longdouble")
        for k in range(num_epochs):
            # shuffle sequences at the beginning of each epoch
            numpy.random.shuffle(train_seqs_id)
            numseqs_left = N
            for seq_id in train_seqs_id:
                # compute/update the learning rate
                if(learning_rate_schedule == "bottu"):
                    eta = C/(t0 + t)
                elif(learning_rate_schedule == "exponential_decay"):
                    eta = t0*a**(t/N)
                elif(learning_rate_schedule == "t_inverse"):
                    eta = t0/(1 + a*(t/N))
                elif(learning_rate_schedule == "constant"):
                    eta = t0
                # print("eta {}".format(eta))
                seq_loglikelihood = crf_model.compute_seq_loglikelihood(w, seq_id)
                seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood
                target_index = crf_model.compute_seq_gradient(w, seq_id, grad)
                if(C):
                    if(regularization_type == 'l2'):
                        seq_loglikelihood += - ((C/N) * (1/2) * numpy.dot(w, w))
                        grad -= ((C/N) * w)
                        w += eta * grad
                    elif(regularization_type == 'l1'):
                        seq_loglikelihood += - (C/N) * numpy.sum(numpy.abs(w))
                        # gradient ascent step before applying the cumulative penalty
                        # (per the cumulative-penalty algorithm of Tsuruoka et al.; this
                        # step is missing in the rendered source and is restored here)
                        w[target_index] += eta * grad[target_index]
                        u += eta * (C/N)
                        w_upd, q_upd = self._apply_l1_penalty(w, q, u, target_index)
                        w = w_upd
                        q = q_upd
                    # update the computed sequence loglikelihood by adding the regularization term contribution
                    seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood
                else:
                    w[target_index] += eta * grad[target_index]
                t += 1
                # clean cached info
                crf_model.clear_cached_info([seq_id])
                # reset the gradient
                grad.fill(0)
                numseqs_left -= 1
                print("num seqs left: {}".format(numseqs_left))
            seqs_cost_vec = [numpy.mean(seqs_loglikelihood_vec[i:i+step_size]) for i in range(0, N, step_size)]
            # consider plotting this vector
            mean_cost_vec.append(numpy.mean(seqs_loglikelihood_vec))
            self._track_sga_optimizer(w, seqs_cost_vec, mean_cost_vec, k)
            if(self._exitloop):
                break
            self._elapsed_time = datetime.now()
        line = "---Model training--- end time {} \n".format(datetime.now())
        ReaderWriter.log_progress(line, log_file)
        ReaderWriter.dump_data(mean_cost_vec, os.path.join(model_dir, 'avg_loglikelihood_training'))
        return(w)
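    # Worked example of the 't_inverse' schedule used above (illustrative numbers):
    # with t0 = 0.1, a = 0.9 and N = 100 training sequences, the learning rate
    #   eta = t0 / (1 + a*(t/N))
    # decays as: t = 0 -> eta = 0.1; t = 100 -> eta = 0.1/1.9 ~= 0.0526;
    # t = 200 -> eta = 0.1/2.8 ~= 0.0357.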
self.training_description["regularization_type"] # regularization parameter lambda C = self.training_description['regularization_value'] # number of training sequences N = len(train_seqs_id) model_dir = self.training_description["model_dir"] log_file = os.path.join(model_dir, "crf_training_log.txt") # keeps track of the log-likelihood of a sequence before weight updating seqs_loglikelihood_vec = numpy.zeros(N) seqs_id_mapper = {seq_id:unique_id for unique_id, seq_id in enumerate(train_seqs_id)} # step size decides the number of data points to average in the seqs_loglikelihood_vec # using 10% of data points step_size = round(N * 0.1) if step_size == 0: step_size = 1 mean_cost_vec = [0] # instance variable to keep track of elapsed time between optimization iterations self._elapsed_time = datetime.now() self._exitloop = False if(regularization_type == "l1"): u = 0 q = numpy.zeros(len(w), dtype = "longdouble") learning_rate_schedule = self.training_description["learning_rate_schedule"] t0 = self.training_description["t0"] # 0<a<1 -- a parameter should be between 0 and 1 exclusively a = self.training_description["a"] t = 0 # gradient grad = numpy.zeros(len(w), dtype = "longdouble") for k in range(num_epochs): # shuffle sequences at the beginning of each epoch numpy.random.shuffle(train_seqs_id) numseqs_left = N for seq_id in train_seqs_id: # compute/update learning rate if(learning_rate_schedule == "bottu"): eta = C/(t0 + t) elif(learning_rate_schedule == "exponential_decay"): eta = t0*a**(t/N) elif(learning_rate_schedule == "t_inverse"): eta = t0/(1 + a*(t/N)) elif(learning_rate_schedule == "constant"): eta = t0 # print("eta {}".format(eta)) # print(seq_id) seq_loglikelihood = crf_model.compute_seq_loglikelihood(w, seq_id) seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood target_index = crf_model.compute_seq_gradient(w, seq_id, grad) # print("seq_grad {}".format(seq_grad)) if(C): if(regularization_type == 'l2'): seq_loglikelihood += - ((C/N) * (1/2) * numpy.dot(w, w)) grad -= ((C/N)* w) w += eta * grad elif(regularization_type == 'l1'): seq_loglikelihood += - (C/N) * numpy.sum(numpy.abs(w)) u += eta * (C/N) w_upd, q_upd = self._apply_l1_penalty(w, q, u, target_index) w = w_upd q = q_upd # update the computed sequence loglikelihood by adding the regularization term contribution seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood else: # print("fval {}".format(fval)) w[target_index] += eta * grad[target_index] t += 1 # clean cached info crf_model.clear_cached_info([seq_id]) # reset the gradient grad.fill(0) numseqs_left -= 1 print("num seqs left: {}".format(numseqs_left)) seqs_cost_vec = [numpy.mean(seqs_loglikelihood_vec[i:i+step_size]) for i in range(0, N, step_size)] # to consider plotting this vector mean_cost_vec.append(numpy.mean(seqs_loglikelihood_vec)) self._track_sga_optimizer(w, seqs_cost_vec, mean_cost_vec, k) if(self._exitloop): break self._elapsed_time = datetime.now() line = "---Model training--- end time {} \n".format(datetime.now()) ReaderWriter.log_progress(line, log_file) ReaderWriter.dump_data(mean_cost_vec, os.path.join(model_dir, 'avg_loglikelihood_training')) return(w) def _sga_svrg(self, w, train_seqs_id): """implements the stochastic variance reduced gradient The algorithm is reported in `Johnson R, Zhang T. Accelerating Stochastic Gradient Descent using Predictive Variance Reduction. 
<https://papers.nips.cc/paper/4937-accelerating-stochastic-gradient-descent-using-predictive-variance-reduction.pdf>`__ Args: w: weight vector (numpy vector) train_seqs_id: list of integers representing sequences IDs """ # keep the original number of epochs requested num_epochs = self.training_description["num_epochs"] # run stochastic gradient ascent to initialize the weights self.training_description["num_epochs"] = 1 # current snapshot of w (i.e. w tilda) w_tilda_c = self._sga_classic(w, train_seqs_id) self.cleanup() self.training_description["num_epochs"] = num_epochs crf_model = self.crf_model regularization_type = self.training_description["regularization_type"] # regularization parameter lambda C = self.training_description['regularization_value'] # number of training sequences N = len(train_seqs_id) model_dir = self.training_description["model_dir"] log_file = os.path.join(model_dir, "crf_training_log.txt") # keeps track of the log-likelihood of a sequence before weight updating seqs_loglikelihood_vec = numpy.zeros(N) seqs_id_mapper = {seq_id:unique_id for unique_id, seq_id in enumerate(train_seqs_id)} # step size decides the number of data points to average in the seqs_loglikelihood_vec # using 10% of data points step_size = round(N * 0.1) if step_size == 0: step_size = 1 mean_cost_vec = [0] if(regularization_type == "l1"): u = 0 q = numpy.zeros(len(w), dtype = "longdouble") eta = self.training_description["t0"] m = 2*N saved_grad = {} # gradient grad = numpy.zeros(len(w), dtype = "longdouble") # instance variable to keep track of elapsed time between optimization iterations self._elapsed_time = datetime.now() self._exitloop = False for s in range(num_epochs): print("stage {}".format(s)) # ################################### # compute the average gradient using the snapshot of w (i.e. 
w tilda) mu_grad = numpy.zeros(len(w_tilda_c), dtype = "longdouble") # compute average gradient seqs_left = N for seq_id in train_seqs_id: target_indx = crf_model.compute_seq_gradient(w_tilda_c, seq_id, grad) fval = grad[target_indx] mu_grad[target_indx] += fval crf_model.clear_cached_info([seq_id]) saved_grad[seq_id] = (target_indx, fval) # reset grad grad.fill(0) seqs_left -= 1 print("average gradient phase: {} seqs left".format(seqs_left)) mu_grad /= N ####################################### w = numpy.copy(w_tilda_c) for t in range(m): seq_id = numpy.random.choice(train_seqs_id, 1)[0] print("round {} out of {}".format(t+1, m)) seq_loglikelihood = crf_model.compute_seq_loglikelihood(w, seq_id) seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood target_indx = crf_model.compute_seq_gradient(w, seq_id, grad) fval = grad[target_indx] if(C): if(regularization_type == 'l2'): seq_loglikelihood += - ((C/N) * (1/2) * numpy.dot(w, w)) grad -= ((C/N)* w) grad[saved_grad[seq_id][0]] -= saved_grad[seq_id][1] grad += mu_grad w += eta * grad elif(regularization_type == 'l1'): seq_loglikelihood += - (C/N) * numpy.sum(numpy.abs(w)) u += eta * (C/N) grad[saved_grad[seq_id][0]] -= saved_grad[seq_id][1] grad += mu_grad w_upd, q_upd = self._apply_l1_penalty(w, q, u, target_indx) w = w_upd q = q_upd # update the computed sequence loglikelihood by adding the regularization term contribution seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood else: w[target_indx] += eta * (fval - saved_grad[seq_id][1]) w += eta * mu_grad t += 1 # clean cached info crf_model.clear_cached_info([seq_id]) grad.fill(0) w_tilda_c = w seqs_cost_vec = [numpy.mean(seqs_loglikelihood_vec[i:i+step_size]) for i in range(0, N, step_size)] # to consider plotting this vector mean_cost_vec.append(numpy.mean(seqs_loglikelihood_vec)) self._track_sga_optimizer(w, seqs_cost_vec, mean_cost_vec, s) if(self._exitloop): break self._elapsed_time = datetime.now() line = "---Model training--- end time {} \n".format(datetime.now()) ReaderWriter.log_progress(line, log_file) ReaderWriter.dump_data(mean_cost_vec, os.path.join(model_dir, 'avg_loglikelihood_training')) return(w) def _apply_l1_penalty(self, w, q, u, w_indx): """apply l1 regularization to the weights it uses the approach of Tsuruoka et al. 
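    # Worked example of the cumulative l1 penalty (illustrative numbers): suppose
    # w[i] = 0.5, the cumulative cap u = 0.2 and the penalty already paid q[i] = -0.05.
    # Since w[i] > 0:  w[i] <- max(0, 0.5 - (0.2 + (-0.05))) = 0.35,
    # and the paid penalty is updated: q[i] <- -0.05 + (0.35 - 0.5) = -0.2.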
    def _track_sga_optimizer(self, w, seqs_loglikelihood, mean_loglikelihood, k):
        """track stochastic gradient ascent optimizers by logging each iteration

           Args:
               w: weight vector (numpy vector)
               seqs_loglikelihood: numpy vector representing the average loglikelihood of batches of sequences
               mean_loglikelihood: mean of the seqs_loglikelihood vector
               k: current epoch
        """
        delta_time = datetime.now() - self._elapsed_time
        self._check_reldiff(mean_loglikelihood[-2], mean_loglikelihood[-1])
        epoch_num = k
        # log file
        model_dir = self.training_description["model_dir"]
        log_file = os.path.join(model_dir, "crf_training_log.txt")
        line = "--- Epoch/pass {} --- \n".format(epoch_num+1)
        line += "Estimated training cost (average loglikelihood) is {} \n".format(mean_loglikelihood[-1])
        line += "Number of seconds spent: {} \n".format(delta_time.total_seconds())
        ReaderWriter.log_progress(line, log_file)
    def cleanup(self):
        """end of training -- cleanup"""
        # reset iteration counter
        self._iter_count = None
        # reset elapsed time between iterations
        self._elapsed_time = None
        self._exitloop = None
class SeqDecodingEvaluator(object):
    """Evaluator class to evaluate the performance of the models

       Args:
           model_repr: the CRF model representation that has a suffix of `ModelRepresentation`
                       such as :class:`HOCRFADModelRepresentation`

       Attributes:
           model_repr: the CRF model representation that has a suffix of `ModelRepresentation`
                       such as :class:`HOCRFADModelRepresentation`

       .. note::
          this class does not support evaluation of segment learning
          (i.e. notations that include IOB2/BIO notation)
    """
    def __init__(self, model_repr):
        self.model_repr = model_repr
    def compute_states_confmatrix(self, Y_seqs_dict):
        """compute/generate the confusion matrix for each state

           Args:
               Y_seqs_dict: dictionary where each sequence has the reference label sequence
                            and its corresponding predicted sequence. It has the following form
                            ``{seq_id:{'Y_ref':[reference_ylabels], 'Y_pred':[predicted_ylabels]}}``
        """
        Y_codebook = self.model_repr.Y_codebook
        M = len(Y_codebook)
        # add another state in case unseen states occur in the test data
        self.model_confusion_matrix = numpy.zeros((M+1, M+1), dtype="float")
        for seq_id in Y_seqs_dict:
            Y_pred = Y_seqs_dict[seq_id]['Y_pred']
            Y_ref = Y_seqs_dict[seq_id]['Y_ref']
            self._compute_model_confusionmatrix(self.map_states_to_num(Y_ref, Y_codebook, M),
                                                self.map_states_to_num(Y_pred, Y_codebook, M))
        statelevel_confmatrix = self._generate_statelevel_confusion_matrix()
        return(statelevel_confmatrix)
    def _generate_statelevel_confusion_matrix(self):
        model_confusion_matrix = self.model_confusion_matrix
        num_states = model_confusion_matrix.shape[0]
        total = model_confusion_matrix.sum()
        statelevel_confmatrix = numpy.zeros((num_states, 2, 2), dtype='float')
        for i in range(num_states):
            tp = model_confusion_matrix[i, i]
            # rows index the reference states, so the off-diagonal row mass is the false negatives
            fn = model_confusion_matrix[i, :].sum() - tp
            # columns index the predicted states, so the off-diagonal column mass is the false positives
            fp = model_confusion_matrix[:, i].sum() - tp
            tn = total - (tp+fp+fn)
            # layout matches get_performance_metric: [[tp, fn], [fp, tn]]
            statelevel_confmatrix[i] = numpy.array([[tp, fn], [fp, tn]])
        return(statelevel_confmatrix)
    def get_performance_metric(self, taglevel_performance, metric, exclude_states=[]):
        """compute the performance of the model using a requested metric

           Args:
               taglevel_performance: `numpy` array with (M+1)x2x2 dimensions; for every state
                                     code a 2x2 confusion matrix is included (plus one for
                                     unseen states). It is computed using
                                     :func:`compute_states_confmatrix`
               metric: evaluation metric that could take one of ``{'f1', 'precision', 'recall', 'accuracy'}``

           Keyword Arguments:
               exclude_states: list (default empty list) of states to exclude from the computation.
                               Usually, in NER applications the non-entity symbol such as 'O'
                               is excluded from the computation

           Example:
               If ``exclude_states = ['O']``, this will replicate the behavior of the
               `conlleval script <http://www.cnts.ua.ac.be/conll2000/chunking/output.html>`__
        """
        Y_codebook = self.model_repr.Y_codebook
        # do not include the excluded states in the computation
        exclude_indices = [Y_codebook[state] for state in exclude_states]
        # total number of states plus 1
        M = len(Y_codebook) + 1
        include_indices = list(set(range(M)) - set(exclude_indices))
        # perform sum across all layers to get the micro-average
        collapsed_performance = taglevel_performance[include_indices].sum(axis=0)
        # print("collapsed performance \n {}".format(collapsed_performance))
        tp = collapsed_performance[0, 0]
        fp = collapsed_performance[1, 0]
        fn = collapsed_performance[0, 1]
        tn = collapsed_performance[1, 1]
        perf_measure = 0
        try:
            if(metric == "f1"):
                precision = tp/(tp + fp)
                recall = tp/(tp + fn)
                f1 = (2 * precision * recall)/(precision + recall)
                print("f1 {}".format(f1))
                perf_measure = f1
            elif(metric == "precision"):
                precision = tp/(tp + fp)
                print("precision {}".format(precision))
                perf_measure = precision
            elif(metric == "recall"):
                recall = tp/(tp + fn)
                print("recall {}".format(recall))
                perf_measure = recall
            elif(metric == "accuracy"):
                accuracy = (tp + tn)/(tp + fp + fn + tn)
                print("accuracy {}".format(accuracy))
                perf_measure = accuracy
        except ZeroDivisionError:
            print("dividing by zero: check/investigate the confusion matrix")
        finally:
            return(perf_measure)
    def map_states_to_num(self, Y, Y_codebook, M):
        """map states to their code/number using the `Y_codebook`

           Args:
               Y: list representing the label sequence
               Y_codebook: dictionary containing the states as keys and the assigned unique codes as values
               M: number of states

           .. note::
              we assign one unique index (i.e. ``M``, equal to ``len(Y_codebook)``) to tags
              that did not occur in the training data
        """
        Y_coded = [Y_codebook[state] if state in Y_codebook else M for state in Y]
        # print("Y_coded {}".format(Y_coded))
        return(Y_coded)
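    # Illustrative example (toy values): with Y_codebook = {'A': 0, 'B': 1} and M = 2,
    # the sequence ['A', 'B', 'C'] maps to [0, 1, 2]; the unseen state 'C' gets code M.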
    def _compute_model_confusionmatrix(self, Y_ref, Y_pred):
        """compute the confusion matrix at the level of the tag/state

           Args:
               Y_ref: list of reference labels (represented by the state codes)
               Y_pred: list of predicted labels (represented by the state codes)
        """
        Y_ref = numpy.asarray(Y_ref)
        Y_pred = numpy.asarray(Y_pred)
        model_confusion_matrix = self.model_confusion_matrix
        for i in range(len(Y_ref)):
            ref_state = Y_ref[i]
            pred_state = Y_pred[i]
            model_confusion_matrix[ref_state, pred_state] += 1
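# Example usage of SeqDecodingEvaluator (a sketch; `model_repr` and the decoded
# sequences in `Y_seqs_dict` are assumed to be produced elsewhere by a trained model):
#
#   evaluator = SeqDecodingEvaluator(model_repr)
#   statelevel_confmatrix = evaluator.compute_states_confmatrix(Y_seqs_dict)
#   f1 = evaluator.get_performance_metric(statelevel_confmatrix, 'f1', exclude_states=['O'])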
class Evaluator(object):
    """Evaluator class to evaluate the performance of the models

       Args:
           model_repr: the CRF model representation that has a suffix of `ModelRepresentation`
                       such as :class:`HOCRFADModelRepresentation`

       Attributes:
           model_repr: the CRF model representation that has a suffix of `ModelRepresentation`
                       such as :class:`HOCRFADModelRepresentation`

       .. note::
          this class is **EXPERIMENTAL/work in progress** and does not support evaluation
          of segment learning. Use :class:`SeqDecodingEvaluator` instead for evaluating
          models learned using **sequence** learning.
    """
    def __init__(self, model_repr):
        self.model_repr = model_repr
    def transform_codebook(self, Y_codebook, prefixes):
        """map states coded in BIO notation to their original state values

           Args:
               Y_codebook: dictionary of states, each assigned a unique integer
               prefixes: tuple of the prefix notations used, such as ("B-", "I-") for BIO
        """
        state_mapper = {}
        for state in Y_codebook:
            if(state != "O"):
                for prefix in prefixes:
                    elems = state.split(prefix)
                    if(len(elems) > 1):
                        new_state = elems[-1]
                        state_mapper[state] = new_state
                        break
            else:
                state_mapper[state] = state
        return(state_mapper)
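    # Illustrative example (toy values): with Y_codebook = {'B-PER': 0, 'I-PER': 1, 'O': 2}
    # and prefixes = ("B-", "I-"), the returned state_mapper is
    # {'B-PER': 'PER', 'I-PER': 'PER', 'O': 'O'}.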
    def compute_model_performance(self, Y_seqs_dict, metric, output_file, states_notation):
        r"""compute the performance of the model

           Args:
               Y_seqs_dict: dictionary where each sequence has the reference label sequence
                            and its corresponding predicted sequence. It has the following form
                            ``{seq_id:{'Y_ref':[reference_ylabels], 'Y_pred':[predicted_ylabels]}}``
               metric: evaluation metric that could take one of ``{'f1', 'precision', 'recall', 'accuracy'}``
               output_file: file where to output the evaluation result
               states_notation: notation used to code the states (e.g. 'BIO')
        """
        Y_codebook = self.model_repr.Y_codebook
        if(states_notation == "BIO"):
            prefixes = ("B-", "I-")
            state_mapper = self.transform_codebook(Y_codebook, prefixes)
            transformed_codebook = {}
            counter = 0
            for new_state in state_mapper.values():
                if(new_state not in transformed_codebook):
                    transformed_codebook[new_state] = counter
                    counter += 1
        else:
            state_mapper = {state: state for state in Y_codebook}
            transformed_codebook = Y_codebook
        transformed_codebook_rev = {code: state for state, code in transformed_codebook.items()}
        # print("original Y_codebook ", Y_codebook)
        # print("state_mapper ", state_mapper)
        # print("transformed_codebook ", transformed_codebook)
        M = len(transformed_codebook)
        # add another state in case unseen states occur in the test data
        model_taglevel_performance = numpy.zeros((M + 1, 2, 2))
        for seq_id in Y_seqs_dict:
            Y_pred = Y_seqs_dict[seq_id]['Y_pred']
            Y_ref = Y_seqs_dict[seq_id]['Y_ref']
            taglevel_performance = self.compute_tags_confusionmatrix(self.map_states_to_num(Y_ref, state_mapper, transformed_codebook, M),
                                                                     self.map_states_to_num(Y_pred, state_mapper, transformed_codebook, M),
                                                                     transformed_codebook_rev,
                                                                     M)
            # print("taglevel_performance {}".format(taglevel_performance))
            model_taglevel_performance += taglevel_performance
        # print("model_taglevel_performance ", model_taglevel_performance)
        # perform sum across all layers to get the micro-average
        collapsed_performance = model_taglevel_performance.sum(axis=0)
        # print("collapsed performance \n {}".format(collapsed_performance))
        tp = collapsed_performance[0, 0]
        fp = collapsed_performance[0, 1]
        fn = collapsed_performance[1, 0]
        tn = collapsed_performance[1, 1]
        perf_measure = 0
        if(metric == "f1"):
            precision = tp/(tp + fp)
            recall = tp/(tp + fn)
            f1 = 2 * ((precision * recall)/(precision + recall))
            print("f1 {}".format(f1))
            perf_measure = f1
        elif(metric == "precision"):
            precision = tp/(tp + fp)
            print("precision {}".format(precision))
            perf_measure = precision
        elif(metric == "recall"):
            recall = tp/(tp + fn)
            print("recall {}".format(recall))
            perf_measure = recall
        elif(metric == "accuracy"):
            accuracy = (tp + tn)/(tp + fp + fn + tn)
            print("accuracy {}".format(accuracy))
            perf_measure = accuracy
        with open(output_file, mode='w') as f:
            f.write("The performance of the model based on the {} measure is {}\n".format(metric, perf_measure))
            f.write("Confusion matrix: tp:{} fp:{} fn:{} tn:{}\n".format(tp, fp, fn, tn))
        return(perf_measure)
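    # Example usage (a sketch; `model_repr` and `Y_seqs_dict` are assumed to be produced
    # elsewhere, and the labels are assumed to follow the BIO scheme):
    #
    #   evaluator = Evaluator(model_repr)
    #   f1 = evaluator.compute_model_performance(Y_seqs_dict, 'f1', 'eval_results.txt', 'BIO')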
    def map_states_to_num(self, Y, state_mapper, transformed_codebook, M):
        """map states to their code/number using the transformed codebook

           Args:
               Y: list representing the label sequence
               state_mapper: mapper between the old states and the new states generated by
                             the :func:`transform_codebook` method
               transformed_codebook: the transformed codebook of the newly identified states
               M: number of states

           .. note::
              we assign one unique index (i.e. ``M``, equal to the length of the transformed
              codebook) to tags that did not occur in the training data
        """
        Y_coded = [transformed_codebook[state_mapper[state]] if state_mapper.get(state) in transformed_codebook else M
                   for state in Y]
        # print("Y_coded {}".format(Y_coded))
        return(Y_coded)
    def compute_tags_confusionmatrix(self, Y_ref, Y_pred, transformed_codebook_rev, M):
        """compute the confusion matrix at the level of the tag/state

           Args:
               Y_ref: list of reference labels (represented by the state codes)
               Y_pred: list of predicted labels (represented by the state codes)
               transformed_codebook_rev: reverse mapping of the transformed codebook of the
                                         newly identified states
               M: number of states
        """
        # print("Y_ref coded ", Y_ref)
        # print("Y_pred coded ", Y_pred)
        detected_statescode = set(Y_ref)
        Y_ref = numpy.asarray(Y_ref)
        Y_pred = numpy.asarray(Y_pred)
        tagslevel_performance = numpy.zeros((M + 1, 2, 2))
        for statecode in detected_statescode:
            # get all indices of the target tag (gold-standard)
            tag_indx_origin = numpy.where(Y_ref == statecode)[0]
            # get all indices of the target tag (predicted)
            tag_indx_pred = numpy.where(Y_pred == statecode)[0]
            tag_tp = len(numpy.where(numpy.in1d(tag_indx_origin, tag_indx_pred))[0])
            tag_fn = len(tag_indx_origin) - tag_tp
            other_indx_origin = numpy.where(Y_ref != statecode)[0]
            tag_fp = len(numpy.where(numpy.in1d(other_indx_origin, tag_indx_pred))[0])
            tag_tn = len(other_indx_origin) - tag_fp
            tagslevel_performance[statecode] = numpy.array([[tag_tp, tag_fp], [tag_fn, tag_tn]])
        return(tagslevel_performance)
if __name__ == "__main__": pass