Source code for pyseqlab.crf_learning

@author: Ahmed Allam

import os
from datetime import datetime
import numpy
from .utilities import ReaderWriter, create_directory, generate_datetime_str, vectorized_logsumexp

class Learner(object):
    """learner used for training CRF models supporting search- and gradient-based learning methods

    Args:
        crf_model: an instance of CRF models such as :class:`HOCRFAD`

    Attributes:
        crf_model: an instance of CRF models such as :class:`HOCRFAD`
        training_description: dictionary that will include the training specification
                              of the model (``None`` until training options are set)
    """

    def __init__(self, crf_model):
        self.crf_model = crf_model
        # no training has been configured yet
        self.training_description = None
def train_model(self, w0, seqs_id, optimization_options, working_dir, save_model=True):
    r"""the **MAIN** method for training models using the various options available

    Args:
        w0: numpy vector representing initial weights for the parameters
        seqs_id: list of integers representing the sequence ids
        optimization_options: dictionary specifying the training method
        working_dir: string representing the directory where the model data
                     and generated files will be saved

    Keyword Arguments:
        save_model: boolean specifying if to save the final model

    Example:
        The available options for training are:

            - `SGA` for stochastic gradient ascent
            - `SGA-ADADELTA` for stochastic gradient ascent using ADADELTA approach
            - `BFGS` or `L-BFGS-B` for optimization using second order information (hessian matrix)
            - `SVRG` for stochastic variance reduced gradient method
            - `COLLINS-PERCEPTRON` for structured perceptron
            - `SAPO` for Search-based Probabilistic Online Learning Algorithm (SAPO) (an adapted version)

        Unknown or missing values fall back to the defaults shown in brackets.
        Possible specifications of the optimization options are:

        ::

            1) {'method': 'SGA-ADADELTA'            [default method]
                'regularization_type': {'l1', 'l2'} ['l2']
                'regularization_value': float       [0.0]
                'num_epochs': integer               [3]
                'tolerance': float                  [1e-8]
                'p_rho': float                      [0.95]
                'epsilon': float                    [1e-6]
               }

            2) {'method': 'SGA' or 'SVRG'
                'regularization_type': {'l1', 'l2'}
                'regularization_value': float
                'num_epochs': integer
                'tolerance': float
                'learning_rate_schedule': one of ("bottu", "exponential_decay",
                                                  "t_inverse", "constant") ["t_inverse"]
                't0': float                         [0.1]
                'a': float, 0 < a < 1               [0.9]
               }

            3) {'method': 'L-BFGS-B' or 'BFGS'
                'regularization_type': 'l2'         (always forced to 'l2')
                'regularization_value': float
                ... every other key is forwarded unchanged as scipy `options`,
                e.g. 'disp': False, 'maxls': 20, 'iprint': -1, 'gtol': 1e-05,
                     'eps': 1e-08, 'maxiter': 15000, 'ftol': 2.220446049250313e-09,
                     'maxcor': 10, 'maxfun': 15000
               }

            4) {'method': 'COLLINS-PERCEPTRON'
                'regularization_type': {'l1', 'l2'}
                'regularization_value': float
                'num_epochs': integer
                'update_type': {'early', 'max-fast', 'max-exhaustive', 'latest'} ['early']
                'shuffle_seq': boolean              [False]
                'beam_size': integer                [number of labels]
                'avg_scheme': {'avg_error', 'avg_uniform'} ['avg_error']
                'seg_other_symbol': non-entity symbol or None
                'tolerance': float
               }

            5) {'method': 'SAPO'
                'regularization_type': {'l2'}
                'regularization_value': float
                'num_epochs': integer
                'update_type': 'early'
                'shuffle_seq': boolean
                'beam_size': integer
                'topK': integer                     [5]
                'gamma': float                      [1]
                'seg_other_symbol': non-entity symbol or None
                'tolerance': float
               }
    """

    def _default_if_negative(value, default):
        # a missing (None) or negative setting falls back to its default
        if value is None or value < 0:
            return default
        return value

    def _is_plain_int(value):
        # bool is a subclass of int but is not an acceptable count
        return isinstance(value, int) and not isinstance(value, bool)

    # keys consumed here; they are not forwarded to scipy's `options`
    pop_keys = {"regularization_type", "regularization_value", "method"}

    lambda_type = optimization_options.get("regularization_type")
    if lambda_type not in {'l1', 'l2'}:
        # default regularization type is l2
        lambda_type = 'l2'

    # regularization must be non-negative
    lambda_val = _default_if_negative(optimization_options.get("regularization_value"), 0.0)

    method = optimization_options.get("method")
    if method not in {"L-BFGS-B", "BFGS", "SGA", "SGA-ADADELTA", "SVRG",
                      "COLLINS-PERCEPTRON", "SAPO"}:
        # default weight learning/optimization method
        method = "SGA-ADADELTA"

    if method in {"L-BFGS-B", "BFGS"}:
        # whatever is left after removing the consumed keys goes to scipy as-is
        options = {key: value for key, value in optimization_options.items()
                   if key not in pop_keys}
        optimization_config = {'method': method,
                               'regularization_value': lambda_val,
                               'regularization_type': 'l2',
                               'options': options}
        estimate_weights = self._optimize_scipy
    else:
        # method is one of SGA, SGA-ADADELTA, SVRG, COLLINS-PERCEPTRON, SAPO
        num_epochs = optimization_options.get("num_epochs")
        if not _is_plain_int(num_epochs) or num_epochs < 0:
            # default number of epochs
            num_epochs = 3
        tolerance = _default_if_negative(optimization_options.get("tolerance"), 1e-8)

        optimization_config = {'method': method,
                               'regularization_type': lambda_type,
                               'regularization_value': lambda_val,
                               'num_epochs': num_epochs,
                               'tolerance': tolerance}

        if method in {"COLLINS-PERCEPTRON", "SAPO"}:
            # for segmentation problems the non-entity symbol is given here, else None
            optimization_config['seg_other_symbol'] = optimization_options.get("seg_other_symbol")

            # beam size defaults to (and is capped at) the number of labels
            default_beam = len(self.crf_model.model.Y_codebook)
            beam_size = optimization_options.get("beam_size")
            if not _is_plain_int(beam_size) or beam_size <= 0 or beam_size > default_beam:
                beam_size = default_beam
            optimization_config["beam_size"] = beam_size
            self.crf_model.beam_size = beam_size

            update_type = optimization_options.get("update_type")
            if update_type not in {'early', 'latest', 'max-exhaustive', 'max-fast'}:
                update_type = 'early'
            optimization_config["update_type"] = update_type

            shuffle_seq = optimization_options.get("shuffle_seq")
            if not isinstance(shuffle_seq, bool):
                shuffle_seq = False
            optimization_config["shuffle_seq"] = shuffle_seq

            if method == "COLLINS-PERCEPTRON":
                # only the schemes the perceptron trainer implements are accepted;
                # anything else (including "survival") falls back to the default
                avg_scheme = optimization_options.get("avg_scheme")
                if avg_scheme not in ("avg_uniform", "avg_error"):
                    avg_scheme = "avg_error"
                optimization_config["avg_scheme"] = avg_scheme
                estimate_weights = self._structured_perceptron
            else:
                # gamma is the learning rate, topK the number of decoded sequences
                optimization_config['gamma'] = _default_if_negative(
                    optimization_options.get("gamma"), 1)
                optimization_config['topK'] = _default_if_negative(
                    optimization_options.get("topK"), 5)
                estimate_weights = self._sapo

        elif method in {"SGA", "SVRG"}:
            learning_rate_schedule = optimization_options.get("learning_rate_schedule")
            if learning_rate_schedule not in {"bottu", "exponential_decay",
                                              "t_inverse", "constant"}:
                # default learning rate schedule
                learning_rate_schedule = "t_inverse"
            optimization_config["learning_rate_schedule"] = learning_rate_schedule
            optimization_config['t0'] = _default_if_negative(optimization_options.get("t0"), 0.1)

            # `a` is stored for every schedule: the SGA trainer reads
            # training_description["a"] unconditionally, so leaving it out for
            # "bottu"/"constant" raised a KeyError
            a = optimization_options.get("a")
            if a is None or a <= 0 or a >= 1:
                a = 0.9
            optimization_config['a'] = a

            estimate_weights = self._sga_classic if method == "SGA" else self._sga_svrg

        elif method == "SGA-ADADELTA":
            estimate_weights = self._sga_adadelta
            optimization_config['p_rho'] = _default_if_negative(
                optimization_options.get("p_rho"), 0.95)
            optimization_config['epsilon'] = _default_if_negative(
                optimization_options.get("epsilon"), 1e-6)

    # save the training options together with where the model will be written
    self.training_description = optimization_config
    model_foldername = generate_datetime_str()
    model_dir = create_directory(model_foldername, create_directory("models", working_dir))
    self.training_description["model_dir"] = model_dir
    self.training_description["model_name"] = model_foldername + ".model"
    self.training_description["train_seqs_id"] = seqs_id

    # everything is defined: estimate the parameters and update the model weights
    w_hat = estimate_weights(w0, seqs_id)
    self.crf_model.weights = w_hat

    if save_model:
        # pickle the model
        modelparts_dir = create_directory("model_parts", model_dir)
        self.crf_model.save_model(modelparts_dir)

    # cleanup the instance variables
    self.cleanup()
def _report_training(self):
    """report training by logging the description to a file"""
    description = self.training_description
    method = description["method"]
    regularization_type = description["regularization_type"]
    # regularization parameter lambda
    C = description['regularization_value']
    model_dir = description["model_dir"]
    model_name = description["model_name"]
    log_file = os.path.join(model_dir, "crf_training_log.txt")

    parts = ["---Model training-- starting time {} \n".format(datetime.now()),
             "model name: {} \n".format(model_name),
             "model directory: {} \n".format(model_dir),
             "model type: {} \n".format(self.crf_model.__class__),
             "training method: {} \n".format(method)]
    if C:
        parts.append("type of regularization: {} \n".format(regularization_type))
        parts.append("value of regularization: {} \n".format(C))

    if method == "SGA":
        learning_rate_schedule = description["learning_rate_schedule"]
        parts.append("learning rate schedule: {} \n".format(learning_rate_schedule))
        parts.append("eta0: {} \n".format(description["t0"]))
        if learning_rate_schedule in ("t_inverse", "exponential_decay"):
            parts.append("a: {} \n".format(description["a"]))
    elif method == "SGA-ADADELTA":
        parts.append("p_rho: {} \n".format(description["p_rho"]))
        parts.append("epsilon: {} \n".format(description["epsilon"]))
    elif method in {"SAPO", "COLLINS-PERCEPTRON"}:
        parts.append("update_type: {} \n".format(description['update_type']))
        parts.append("beam_size: {} \n".format(description['beam_size']))
        parts.append("shuffle_seq: {} \n".format(description['shuffle_seq']))
        if method == "COLLINS-PERCEPTRON":
            parts.append("averaging scheme: {} \n".format(description["avg_scheme"]))
        else:
            parts.append("gamma (learning rate): {} \n".format(description['gamma']))
            parts.append("topK (number of top decoded seqs): {} \n".format(description['topK']))

    if method not in ("L-BFGS-B", "BFGS"):
        parts.append("number of epochs: {} \n".format(description['num_epochs']))

    # write to file
    ReaderWriter.log_progress("".join(parts), log_file)


def _check_reldiff(self, x, y):
    """set the exit flag based on the relative difference between two numbers

    ``self._exitloop`` becomes True when ``|y|`` is within the tolerance or when
    the relative difference of ``x`` and ``y`` is within the tolerance; it is
    left untouched when ``x == y``.

    Args:
        x: float
        y: float
    """
    tolerance = self.training_description["tolerance"]
    if numpy.abs(y) <= tolerance:
        self._exitloop = True
    elif x != y:
        reldiff = numpy.abs(x - y) / (numpy.abs(x) + numpy.abs(y))
        self._exitloop = bool(reldiff <= tolerance)


def _optscipy_seqs_loglikelihood(self, w, seqs_id):
    """compute seqs loglikelihood when using the BFGS and L-BFGS-B optimization options

    Args:
        w: weight vector (numpy vector)
        seqs_id: list of integers representing ids assigned to the sequence

    Returns:
        the negated (optionally l2-penalized) loglikelihood, since scipy minimizes
    """
    crf_model = self.crf_model
    seqs_loglikelihood = crf_model.compute_seqs_loglikelihood(w, seqs_id)
    # clear cached info
    crf_model.clear_cached_info(seqs_id)
    l2 = self.training_description["regularization_value"]
    if l2 > 0:
        # log(p(Y|X;w)) - lambda/2 * ||w||**2
        seqs_loglikelihood = seqs_loglikelihood - ((l2/2) * numpy.dot(w, w))
    # the optimization is a minimization, hence the multiplication by -1
    return seqs_loglikelihood * -1


def _optscipy_seqs_gradient(self, w, seqs_id):
    """compute seqs gradient when using the BFGS and L-BFGS-B optimization options

    Args:
        w: weight vector (numpy vector)
        seqs_id: list of integers representing ids assigned to the sequence

    Returns:
        the negated (optionally l2-penalized) gradient, since scipy minimizes
    """
    crf_model = self.crf_model
    seqs_grad = crf_model.compute_seqs_gradient(w, seqs_id)
    # clear cached info
    crf_model.clear_cached_info(seqs_id)
    l2 = self.training_description["regularization_value"]
    if l2 > 0:
        seqs_grad = seqs_grad - (l2 * w)
    # the optimization is a minimization, hence the multiplication by -1
    return seqs_grad * -1


def _optimize_scipy(self, w, train_seqs_id):
    """estimate the parameters w of the model using `scipy optimize function`

    it uses `optimize.minimize()` function from the scipy package

    Args:
        w: weight vector (numpy vector)
        train_seqs_id: list of integers representing ids of the training sequences

    Returns:
        the estimated weight vector (``result.x``)
    """
    from scipy import optimize
    self._report_training()
    method = self.training_description["method"]
    options = self.training_description['options']
    # to keep track of elapsed time between optimization iterations
    self._elapsed_time = datetime.now()
    self._iter_count = 0
    result = optimize.minimize(fun=self._optscipy_seqs_loglikelihood,
                               x0=w,
                               # one-element tuple: the id list is a single extra argument
                               args=(train_seqs_id,),
                               method=method,
                               jac=self._optscipy_seqs_gradient,
                               options=options,
                               callback=self._track_scipy_optimizer)
    model_dir = self.training_description["model_dir"]
    log_file = os.path.join(model_dir, "crf_training_log.txt")
    line = "---Model training--- end time {} \n".format(datetime.now())
    line += "\n \n"
    ReaderWriter.log_progress(line, log_file)
    print("success: ", result['success'])
    # estimated optimal weights
    return result.x


def _track_scipy_optimizer(self, w):
    """track scipy optimization by logging each iteration

    Args:
        w: weight vector (numpy vector)
    """
    self._iter_count += 1
    delta_time = datetime.now() - self._elapsed_time
    crf_model = self.crf_model
    # approximate estimation of sum of loglikelihood -- using previous weights
    train_seqs_id = self.training_description["train_seqs_id"]
    seqs_loglikelihood = 0
    for seq_id in train_seqs_id:
        seqs_loglikelihood += crf_model.seqs_info[seq_id]["loglikelihood"]
    seqs_loglikelihood *= -1
    # To compute the sum of the sequences' loglikelihood with the current weights
    # (it should decrease after each iteration of a successful training), use
    #     seqs_loglikelihood = crf_model.compute_seqs_loglikelihood(w, train_seqs_id)
    # It is useful as a diagnostic but expensive to recompute.
    model_dir = self.training_description["model_dir"]
    log_file = os.path.join(model_dir, "crf_training_log.txt")
    line = "--- Iteration {} --- \n".format(self._iter_count)
    line += "Estimated average negative loglikelihood is {} \n".format(seqs_loglikelihood)
    line += "Number of seconds spent: {} \n".format(delta_time.total_seconds())
    ReaderWriter.log_progress(line, log_file)
    self._elapsed_time = datetime.now()
    print("iteration ", self._iter_count)


def _identify_violation_indx(self, viol_indx, y_ref_boundaries):
    """determine the position where the violation occurs

    violation means when the reference state falls off the specified beam while decoding

    Args:
        viol_indx: 1-based index where a violation occurred while decoding
        y_ref_boundaries: boundaries of the labels/tags in the reference sequence

    Returns:
        tuple ``(viol_pos, viol_boundindex)``: end position of the first boundary
        ending at or after ``viol_indx`` and the 1-based rank of that boundary

    Raises:
        ValueError: if no boundary ends at or after ``viol_indx``
    """
    for boundindex, (__, end_pos) in enumerate(y_ref_boundaries, start=1):
        if end_pos >= viol_indx:
            return (end_pos, boundindex)
    raise ValueError("no reference boundary covers violation index {}".format(viol_indx))


def _compute_seq_decerror(self, y_ref, y_imposter, viol_pos):
    """compute the decoding error of a sequence

    Args:
        y_ref: reference sequence list of labels
        y_imposter: imposter/decoded sequence list of labels
        viol_pos: index where violation occurred, it is identified using
                  :func:`_identify_violation_indx` function

    Returns:
        fraction (in [0, 1]) of the first ``viol_pos`` labels that differ
    """
    T = len(y_ref[:viol_pos])
    num_mismatch = sum(1 for i in range(T) if y_ref[i] != y_imposter[i])
    return float(num_mismatch/T)


def _unpack_windxfval(self, y_windxfval):
    """unpack the weight indices and corresponding feature values

    Args:
        y_windxfval: tuple having two numpy array entries; the first representing
                     the weight indices of the features while the second representing
                     the values that are feature sum/count
    """
    windx, fval = y_windxfval
    return (windx, fval)


def _find_update_violation(self, w, seq_id):
    """determine the *best* imposter sequence for weight updates

    Args:
        w: weight vector (numpy vector)
        seq_id: integer representing unique id assigned to the sequence

    Returns:
        tuple ``(ref_unp_windxfval, imps_unp_windxfval, seq_err_count)``; the
        first two stay None when no update is needed, and all three stay None for
        the not yet implemented 'latest' update type when a violation occurred
    """
    method = self.training_description['method']
    beam_size = self.training_description['beam_size']
    update_type = self.training_description['update_type']
    topK = self.training_description.get('topK')
    crf_model = self.crf_model
    seqs_info = crf_model.seqs_info
    crf_model.check_cached_info(seq_id, {'Y': (seq_id, )})
    y_ref = seqs_info[seq_id]['Y']['flat_y']
    y_ref_boundaries = seqs_info[seq_id]['Y']['boundaries']

    # only the 'early' update stops decoding at the first violation
    early_stop = update_type not in {'max-fast', 'max-exhaustive', 'latest'}
    if not topK:
        y_imposter, viol_indx = crf_model.viterbi(w, seq_id, beam_size, early_stop, y_ref)
        y_imposters = [y_imposter]
    else:
        y_imposters, viol_indx = crf_model.viterbi(w, seq_id, beam_size, early_stop, y_ref, topK)

    seq_err_count = None
    ref_unp_windxfval = None
    imps_unp_windxfval = None
    # top decoded sequence
    y_imposter = y_imposters[0]

    if not viol_indx:
        # we can perform full update
        print("in full update routine ...")
        T = seqs_info[seq_id]['T']
        seq_err_count = self._compute_seq_decerror(y_ref, y_imposter, T)
        if seq_err_count or method == "SAPO":
            ref_unp_windxfval, imps_unp_windxfval = self._load_gfeatures(
                seq_id, "globalfeatures", y_imposters, T, len(y_ref_boundaries))
    elif update_type == "early":
        print("in early update routine ...")
        # viol_index is 1-based indexing
        viol_pos, viol_boundindex = self._identify_violation_indx(viol_indx[0], y_ref_boundaries)
        seq_err_count = self._compute_seq_decerror(y_ref, y_imposter, viol_pos)
        ref_unp_windxfval, imps_unp_windxfval = self._load_gfeatures(
            seq_id, "globalfeatures_per_boundary", y_imposters, viol_pos, viol_boundindex)
    elif update_type == "max-exhaustive":
        # max update is only supported for one imposter sequence
        max_diff = numpy.inf
        L = crf_model.model.L
        print("in max-exhaustive update routine ...")
        # the best candidate is kept apart from the per-iteration values so a
        # later, weaker violation cannot overwrite it
        best_ref = None
        best_imps = None
        # viol_index is 1-based indexing
        for i, indx in enumerate(viol_indx):
            if i == 0:
                # case of early update index
                if L > 1:
                    viol_pos, viol_boundindex = self._identify_violation_indx(indx, y_ref_boundaries)
                else:
                    viol_pos = indx
                    viol_boundindex = viol_pos
                # the decoding error is measured at the earliest violation only
                seq_err_count = self._compute_seq_decerror(y_ref, y_imposter, viol_pos)
            elif L > 1:
                __, viol_pos = y_ref_boundaries[viol_boundindex]
                viol_boundindex += 1
            else:
                viol_pos = indx
                viol_boundindex = viol_pos
            cand_ref, cand_imps = self._load_gfeatures(
                seq_id, "globalfeatures_per_boundary", y_imposters, viol_pos, viol_boundindex)
            ref_windx, ref_fval = cand_ref
            imp_windx, imp_fval = cand_imps[0]
            diff = numpy.dot(w[ref_windx], ref_fval) - numpy.dot(w[imp_windx], imp_fval)
            # less than or equal keeps the longest sequence having max difference
            if diff <= max_diff:
                max_diff = diff
                best_ref = (ref_windx, ref_fval)
                best_imps = [(imp_windx, imp_fval)]
        ref_unp_windxfval = best_ref
        imps_unp_windxfval = best_imps
    elif update_type == "max-fast":
        # based on empirical observation, the last violation index (i.e. where the
        # beam falls off) is almost always yielding the max violation
        # this is a heuristic, for an exhaustive procedure, choose `max-exhaustive`
        print("in max-fast update routine ...")
        # viol_index is 1-based indexing
        viol_pos, viol_boundindex = self._identify_violation_indx(viol_indx[-1], y_ref_boundaries)
        seq_err_count = self._compute_seq_decerror(y_ref, y_imposter, viol_pos)
        ref_unp_windxfval, imps_unp_windxfval = self._load_gfeatures(
            seq_id, "globalfeatures_per_boundary", y_imposters, viol_pos, viol_boundindex)
    elif update_type == 'latest':
        # to implement latest update at some point..
        pass

    return (ref_unp_windxfval, imps_unp_windxfval, seq_err_count)


def _load_gfeatures(self, seq_id, gfeatures_type, y_imposters, ypos_indx, boundpos_indx):
    """load the global features of the reference and imposter/decoded sequence

    Args:
        seq_id: id of the sequence
        gfeatures_type: determine the representation either aggregated or by boundary
        y_imposters: list of imposter sequences
        ypos_indx: index of the considered end of the label sequence
        boundpos_indx: index of the boundary corresponding to the identified `ypos_indx`

    Returns:
        tuple ``(y_ref_windxfval, imposters_windxfval)`` where the second entry
        has one element per imposter sequence
    """
    seg_other_symbol = self.training_description['seg_other_symbol']
    crf_model = self.crf_model
    seqs_info = crf_model.seqs_info
    y_ref_boundaries = seqs_info[seq_id]['Y']['boundaries']
    if gfeatures_type == "globalfeatures":
        per_boundary = False
        # aggregated features are used as they are, no boundary slicing
        y_ref_boundaries = None
    else:
        per_boundary = True
    crf_model.check_cached_info(seq_id, {gfeatures_type: (seq_id, per_boundary)})
    ref_gfeatures = seqs_info[seq_id][gfeatures_type]
    if y_ref_boundaries:
        y_ref_windxfval = crf_model.represent_globalfeature(
            ref_gfeatures, y_ref_boundaries[:boundpos_indx])
    else:
        y_ref_windxfval = seqs_info[seq_id][gfeatures_type]

    # generate global features for the imposters
    imposters_windxfval = []
    for y_imposter in y_imposters:
        imposter_gfeatures_perboundary, y_imposter_boundaries = crf_model.load_imposter_globalfeatures(
            seq_id, y_imposter[:ypos_indx], seg_other_symbol)
        imposters_windxfval.append(
            crf_model.represent_globalfeature(imposter_gfeatures_perboundary, y_imposter_boundaries))
    return (y_ref_windxfval, imposters_windxfval)


def _update_weights_sapo(self, w, ref_unp_windxfval, imps_unp_windxfval, prob_vec):
    """update weight vector (in place) for the SAPO method

    Args:
        w: weight vector (numpy vector)
        ref_unp_windxfval: tuple of two numpy array elements representing the weight
                           indices and corresponding feature sum/count of the
                           reference sequence
        imps_unp_windxfval: list of tuples each comprising two numpy array elements
                            representing the weight indices and corresponding feature
                            sum/count of the imposter sequences
        prob_vec: numpy vector representing the probability of each imposter sequence
    """
    gamma = self.training_description['gamma']
    # update weights using the decoded sequences
    for (windx, fval), prob in zip(imps_unp_windxfval, prob_vec):
        w[windx] -= (gamma*prob) * fval
    # update weights using the reference sequence
    windx, fval = ref_unp_windxfval
    w[windx] += gamma * fval


def _compute_probvec_sapo(self, w, imps_unp_windxfval):
    """compute the probability of each imposter sequence in the SAPO algorithm

    Args:
        w: weight vector (numpy vector)
        imps_unp_windxfval: list of tuples each comprising the weight indices and
                            corresponding feature sum/count of an imposter sequence

    Returns:
        numpy vector of the scores normalized with a log-sum-exp
    """
    ll_vec = numpy.zeros(len(imps_unp_windxfval))
    for i, (windx, fval) in enumerate(imps_unp_windxfval):
        ll_vec[i] = numpy.dot(w[windx], fval)
    # normalize
    Z = vectorized_logsumexp(ll_vec)
    return numpy.exp(ll_vec - Z)


def _sapo(self, w, train_seqs_id):
    """implements Search-based Probabilistic Online Learning Algorithm (SAPO)

    this implementation adapts it to 'violation-fixing' framework
    (i.e. inexact search is supported)

    .. note::

       the regularization is based on averaging rather than l2 as it seems to be
       consistent during training while using exact or inexact search

    Args:
        w: weight vector (numpy vector), updated in place during training
        train_seqs_id: list of integers representing ids of the training sequences

    Returns:
        the average of the weight vectors obtained after every sequence update
    """
    self._report_training()
    num_epochs = self.training_description["num_epochs"]
    shuffle_seq = self.training_description['shuffle_seq']
    model_dir = self.training_description["model_dir"]
    log_file = os.path.join(model_dir, "crf_training_log.txt")
    N = len(train_seqs_id)
    crf_model = self.crf_model
    # instance variable to keep track of elapsed time between optimization iterations
    self._elapsed_time = datetime.now()
    self._exitloop = False
    avg_error_list = [0]
    w_avg = numpy.zeros(len(w), dtype='longdouble')
    # number of weight vectors accumulated in w_avg
    num_accumulated = 0
    for k in range(num_epochs):
        seq_left = N
        error_count = 0
        if shuffle_seq:
            numpy.random.shuffle(train_seqs_id)
        for seq_id in train_seqs_id:
            ref_unp_windxfval, imps_unp_windxfval, seq_err_count = self._find_update_violation(w, seq_id)
            prob_vec = self._compute_probvec_sapo(w, imps_unp_windxfval)
            self._update_weights_sapo(w, ref_unp_windxfval, imps_unp_windxfval, prob_vec)
            w_avg += w
            num_accumulated += 1
            crf_model.clear_cached_info([seq_id])
            seq_left -= 1
            if seq_err_count:
                error_count += seq_err_count
            print("sequences left {}".format(seq_left))
        avg_error_list.append(float(error_count/N))
        self._track_perceptron_optimizer(w, k, avg_error_list)
        ReaderWriter.dump_data(w_avg/((k+1)*N),
                               os.path.join(model_dir, "model_avgweights_epoch_{}".format(k+1)))
        print("average error : {}".format(avg_error_list[1:]))
        if self._exitloop:
            break
        self._elapsed_time = datetime.now()

    line = "---Model training--- end time {} \n".format(datetime.now())
    ReaderWriter.log_progress(line, log_file)
    # average over the updates actually performed, so an early exit is not
    # scaled by the epochs that never ran
    if num_accumulated:
        w = w_avg/num_accumulated
    ReaderWriter.dump_data(avg_error_list, os.path.join(model_dir, 'avg_decodingerror_training'))
    return w


def _update_weights_perceptron(self, w, ref_unp_windxfval, imp_unp_windxfval):
    """update weight vector (in place) for the COLLINS-PERCEPTRON method

    Args:
        w: weight vector (numpy vector)
        ref_unp_windxfval: tuple of the weight indices and corresponding feature
                           sum/count of the reference sequence
        imp_unp_windxfval: tuple of the weight indices and corresponding feature
                           sum/count of the imposter sequence
    """
    ref_windx, ref_fval = ref_unp_windxfval
    imp_windx, imp_fval = imp_unp_windxfval
    w[ref_windx] += ref_fval
    w[imp_windx] -= imp_fval


def _structured_perceptron(self, w, train_seqs_id):
    """implements structured perceptron algorithm in particular the average perceptron

    it was introduced by Michael Collins in 2002;
    this implementation supports different averaging schemes for the weight learning

    Args:
        w: weight vector (numpy vector), updated in place during training
        train_seqs_id: list of integers representing ids assigned to the sequence

    Returns:
        the averaged weight vector, or ``w`` itself when no update was accumulated

    Raises:
        ValueError: if the configured averaging scheme is not supported
    """
    self._report_training()
    num_epochs = self.training_description["num_epochs"]
    avg_scheme = self.training_description["avg_scheme"]
    shuffle_seq = self.training_description["shuffle_seq"]
    model_dir = self.training_description["model_dir"]
    log_file = os.path.join(model_dir, "crf_training_log.txt")
    N = len(train_seqs_id)
    crf_model = self.crf_model
    if avg_scheme not in {"avg_error", "avg_uniform"}:
        # fail with a clear message instead of a NameError further down
        raise ValueError("unsupported averaging scheme: {}".format(avg_scheme))
    # instance variable to keep track of elapsed time between optimization iterations
    self._elapsed_time = datetime.now()
    self._exitloop = False
    # accumulated sum of estimated weights
    w_avg = numpy.zeros(len(w), dtype="longdouble")
    avg_error_list = [0]
    num_upd = 0
    for k in range(num_epochs):
        seq_left = N
        error_count = 0
        if shuffle_seq:
            numpy.random.shuffle(train_seqs_id)
        for seq_id in train_seqs_id:
            print("sequences left {}".format(seq_left))
            ref_unp_windxfval, imps_unp_windxfval, seq_err_count = self._find_update_violation(w, seq_id)
            # if decoding errors with the current weight occurs
            if seq_err_count:
                error_count += seq_err_count
                if avg_scheme == "avg_error":
                    # emphasize previous weights that have small average decoding error
                    w_avg += (1-seq_err_count) * w
                    num_upd += (1-seq_err_count)
                else:
                    w_avg += w
                    num_upd += 1
                # update current weight
                self._update_weights_perceptron(w, ref_unp_windxfval, imps_unp_windxfval[0])
            crf_model.clear_cached_info([seq_id])
            seq_left -= 1
        avg_error_list.append(float(error_count/N))
        self._track_perceptron_optimizer(w, k, avg_error_list)
        w_dump = w_avg/num_upd if num_upd else w_avg
        ReaderWriter.dump_data(w_dump,
                               os.path.join(model_dir, "model_avgweights_epoch_{}".format(k+1)))
        print("average error : {}".format(avg_error_list[1:]))
        if self._exitloop:
            break
        self._elapsed_time = datetime.now()
    if num_upd:
        w = w_avg/num_upd
    line = "---Model training--- end time {} \n".format(datetime.now())
    ReaderWriter.log_progress(line, log_file)
    ReaderWriter.dump_data(avg_error_list, os.path.join(model_dir, 'avg_decodingerror_training'))
    return w


def _track_perceptron_optimizer(self, w, k, avg_error_list):
    """track search based optimized (such as SAPO and COLLINS-PERCEPTRON) by logging each iteration

    Args:
        w: weight vector (numpy vector)
        k: current epoch
        avg_error_list: list of the decoding errors in each previous epochs
    """
    delta_time = datetime.now() - self._elapsed_time
    # updates self._exitloop from the last two epochs' average errors
    self._check_reldiff(avg_error_list[-2], avg_error_list[-1])
    model_dir = self.training_description["model_dir"]
    log_file = os.path.join(model_dir, "crf_training_log.txt")
    line = "--- Iteration {} --- \n".format(k+1)
    line += "Average percentage of decoding error: {} \n".format(avg_error_list[-1]*100)
    line += "Number of seconds spent: {} \n".format(delta_time.total_seconds())
    ReaderWriter.log_progress(line, log_file)
    # dump the learned weights for every pass
    ReaderWriter.dump_data(w, os.path.join(model_dir, "model_weights_epoch_{}".format(k+1)))


def _sga_adadelta(self, w, train_seqs_id):
    """implements stochastic gradient ascent using adaptive approach of ADADELTA

    Args:
        w: weight vector (numpy vector)
        train_seqs_id: list of integers representing ids assigned to the sequence
    """
    self._report_training()
    crf_model = self.crf_model
    num_epochs = self.training_description["num_epochs"]
    regularization_type = self.training_description["regularization_type"]
    # regularization parameter lambda
    C = self.training_description['regularization_value']
    # number of training sequences
    N = len(train_seqs_id)
    model_dir = self.training_description["model_dir"]
    log_file = os.path.join(model_dir, "crf_training_log.txt")
    # keeps track of the log-likelihood of a sequence before weight updating
    seqs_loglikelihood_vec = numpy.zeros(N)
    seqs_id_mapper = {seq_id: unique_id for unique_id, seq_id in enumerate(train_seqs_id)}
    # step size decides the number of data points to average in the
    # seqs_loglikelihood_vec -- using 10% of data points
    step_size = round(N * 0.1)
    if step_size == 0:
        step_size = 1
    mean_cost_vec = [0]
    p_rho = self.training_description["p_rho"]
    epsilon = self.training_description["epsilon"]
    E_g2 = numpy.zeros(len(w), dtype="longdouble")
    E_deltaw2 = numpy.zeros(len(w), dtype="longdouble")
    if regularization_type == "l1":
        u = 0
        q = numpy.zeros(len(w), dtype="longdouble")
    # gradient
    grad = numpy.zeros(len(w), dtype="longdouble")
    # instance variable to keep track of elapsed time between optimization iterations
    self._elapsed_time = datetime.now()
    self._exitloop = False
    for k in range(num_epochs):
        # shuffle sequences at the beginning of each epoch
        numpy.random.shuffle(train_seqs_id)
        numseqs_left = N
        print("k ", k)
        for seq_id in train_seqs_id:
            seq_loglikelihood = crf_model.compute_seq_loglikelihood(w, seq_id)
            seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood
            target_indx = crf_model.compute_seq_gradient(w, seq_id, grad)
            if C:
                if regularization_type == 'l2':
                    seq_loglikelihood += - ((C/N) * (1/2) * numpy.dot(w, w))
                    grad -= ((C/N) * w)
                elif regularization_type == 'l1':
                    seq_loglikelihood += - (C/N) * numpy.sum(numpy.abs(w))
                # update the computed sequence loglikelihood by adding the
                # regularization term contribution
                seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood
                # accumulate gradient
                E_g2 = p_rho * E_g2 + (1-p_rho) * numpy.square(grad)
                RMS_g = numpy.sqrt(E_g2 + epsilon)
                RMS_deltaw = numpy.sqrt(E_deltaw2 + epsilon)
                ratio = (RMS_deltaw/RMS_g)
                deltaw = ratio * grad
                E_deltaw2 = p_rho * E_deltaw2 + (1-p_rho) * numpy.square(deltaw)
                w += deltaw
                if regularization_type == "l1":
                    u += ratio * (C/N)
                    w_upd, q_upd = self._apply_l1_penalty(w, q, u, target_indx)
                    w = w_upd
                    q = q_upd
            else:
                # accumulate gradient (only the entries touched by this sequence)
                fval = grad[target_indx]
                E_g2 = p_rho * E_g2
                E_g2[target_indx] += (1-p_rho) * numpy.square(fval)
                RMS_g = numpy.sqrt(E_g2 + epsilon)
                RMS_deltaw = numpy.sqrt(E_deltaw2 + epsilon)
                ratio = (RMS_deltaw/RMS_g)
                deltaw = ratio[target_indx] * fval
                E_deltaw2 = p_rho * E_deltaw2
                E_deltaw2[target_indx] += (1-p_rho) * numpy.square(deltaw)
                w[target_indx] += deltaw
            # clean cached info
            crf_model.clear_cached_info([seq_id])
            numseqs_left -= 1
            # reset the gradient
            grad.fill(0)
            print("num seqs left: {}".format(numseqs_left))
        # NOTE(review): the per-epoch bookkeeping and the return of this method
        # follow on the next source line and are not part of this block
seqs_cost_vec = [numpy.mean(seqs_loglikelihood_vec[i:i+step_size]) for i in range(0, N, step_size)] # to consider plotting this vector mean_cost_vec.append(numpy.mean(seqs_loglikelihood_vec)) self._track_sga_optimizer(w, seqs_cost_vec, mean_cost_vec, k) if(self._exitloop): break self._elapsed_time = line = "---Model training--- end time {} \n".format( ReaderWriter.log_progress(line, log_file) ReaderWriter.dump_data(mean_cost_vec, os.path.join(model_dir, 'avg_loglikelihood_training')) return(w) def _sga_classic(self, w, train_seqs_id): """implements stochastic gradient ascent Args: w: weight vector (numpy vector) train_seqs_id: list of integers representing ids assigned to the sequence """ self._report_training() crf_model = self.crf_model num_epochs = self.training_description["num_epochs"] regularization_type = self.training_description["regularization_type"] # regularization parameter lambda C = self.training_description['regularization_value'] # number of training sequences N = len(train_seqs_id) model_dir = self.training_description["model_dir"] log_file = os.path.join(model_dir, "crf_training_log.txt") # keeps track of the log-likelihood of a sequence before weight updating seqs_loglikelihood_vec = numpy.zeros(N) seqs_id_mapper = {seq_id:unique_id for unique_id, seq_id in enumerate(train_seqs_id)} # step size decides the number of data points to average in the seqs_loglikelihood_vec # using 10% of data points step_size = round(N * 0.1) if step_size == 0: step_size = 1 mean_cost_vec = [0] # instance variable to keep track of elapsed time between optimization iterations self._elapsed_time = self._exitloop = False if(regularization_type == "l1"): u = 0 q = numpy.zeros(len(w), dtype = "longdouble") learning_rate_schedule = self.training_description["learning_rate_schedule"] t0 = self.training_description["t0"] # 0<a<1 -- a parameter should be between 0 and 1 exclusively a = self.training_description["a"] t = 0 # gradient grad = numpy.zeros(len(w), dtype = 
"longdouble") for k in range(num_epochs): # shuffle sequences at the beginning of each epoch numpy.random.shuffle(train_seqs_id) numseqs_left = N for seq_id in train_seqs_id: # compute/update learning rate if(learning_rate_schedule == "bottu"): eta = C/(t0 + t) elif(learning_rate_schedule == "exponential_decay"): eta = t0*a**(t/N) elif(learning_rate_schedule == "t_inverse"): eta = t0/(1 + a*(t/N)) elif(learning_rate_schedule == "constant"): eta = t0 # print("eta {}".format(eta)) # print(seq_id) seq_loglikelihood = crf_model.compute_seq_loglikelihood(w, seq_id) seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood target_index = crf_model.compute_seq_gradient(w, seq_id, grad) # print("seq_grad {}".format(seq_grad)) if(C): if(regularization_type == 'l2'): seq_loglikelihood += - ((C/N) * (1/2) *, w)) grad -= ((C/N)* w) w += eta * grad elif(regularization_type == 'l1'): seq_loglikelihood += - (C/N) * numpy.sum(numpy.abs(w)) u += eta * (C/N) w_upd, q_upd = self._apply_l1_penalty(w, q, u, target_index) w = w_upd q = q_upd # update the computed sequence loglikelihood by adding the regularization term contribution seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood else: # print("fval {}".format(fval)) w[target_index] += eta * grad[target_index] t += 1 # clean cached info crf_model.clear_cached_info([seq_id]) # reset the gradient grad.fill(0) numseqs_left -= 1 print("num seqs left: {}".format(numseqs_left)) seqs_cost_vec = [numpy.mean(seqs_loglikelihood_vec[i:i+step_size]) for i in range(0, N, step_size)] # to consider plotting this vector mean_cost_vec.append(numpy.mean(seqs_loglikelihood_vec)) self._track_sga_optimizer(w, seqs_cost_vec, mean_cost_vec, k) if(self._exitloop): break self._elapsed_time = line = "---Model training--- end time {} \n".format( ReaderWriter.log_progress(line, log_file) ReaderWriter.dump_data(mean_cost_vec, os.path.join(model_dir, 'avg_loglikelihood_training')) return(w) def _sga_svrg(self, w, train_seqs_id): 
"""implements the stochastic variance reduced gradient The algorithm is reported in `Johnson R, Zhang T. Accelerating Stochastic Gradient Descent using Predictive Variance Reduction. <>`__ Args: w: weight vector (numpy vector) train_seqs_id: list of integers representing sequences IDs """ # keep the original number of epochs requested num_epochs = self.training_description["num_epochs"] # run stochastic gradient ascent to initialize the weights self.training_description["num_epochs"] = 1 # current snapshot of w (i.e. w tilda) w_tilda_c = self._sga_classic(w, train_seqs_id) self.cleanup() self.training_description["num_epochs"] = num_epochs crf_model = self.crf_model regularization_type = self.training_description["regularization_type"] # regularization parameter lambda C = self.training_description['regularization_value'] # number of training sequences N = len(train_seqs_id) model_dir = self.training_description["model_dir"] log_file = os.path.join(model_dir, "crf_training_log.txt") # keeps track of the log-likelihood of a sequence before weight updating seqs_loglikelihood_vec = numpy.zeros(N) seqs_id_mapper = {seq_id:unique_id for unique_id, seq_id in enumerate(train_seqs_id)} # step size decides the number of data points to average in the seqs_loglikelihood_vec # using 10% of data points step_size = round(N * 0.1) if step_size == 0: step_size = 1 mean_cost_vec = [0] if(regularization_type == "l1"): u = 0 q = numpy.zeros(len(w), dtype = "longdouble") eta = self.training_description["t0"] m = 2*N saved_grad = {} # gradient grad = numpy.zeros(len(w), dtype = "longdouble") # instance variable to keep track of elapsed time between optimization iterations self._elapsed_time = self._exitloop = False for s in range(num_epochs): print("stage {}".format(s)) # ################################### # compute the average gradient using the snapshot of w (i.e. 
w tilda) mu_grad = numpy.zeros(len(w_tilda_c), dtype = "longdouble") # compute average gradient seqs_left = N for seq_id in train_seqs_id: target_indx = crf_model.compute_seq_gradient(w_tilda_c, seq_id, grad) fval = grad[target_indx] mu_grad[target_indx] += fval crf_model.clear_cached_info([seq_id]) saved_grad[seq_id] = (target_indx, fval) # reset grad grad.fill(0) seqs_left -= 1 print("average gradient phase: {} seqs left".format(seqs_left)) mu_grad /= N ####################################### w = numpy.copy(w_tilda_c) for t in range(m): seq_id = numpy.random.choice(train_seqs_id, 1)[0] print("round {} out of {}".format(t+1, m)) seq_loglikelihood = crf_model.compute_seq_loglikelihood(w, seq_id) seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood target_indx = crf_model.compute_seq_gradient(w, seq_id, grad) fval = grad[target_indx] if(C): if(regularization_type == 'l2'): seq_loglikelihood += - ((C/N) * (1/2) *, w)) grad -= ((C/N)* w) grad[saved_grad[seq_id][0]] -= saved_grad[seq_id][1] grad += mu_grad w += eta * grad elif(regularization_type == 'l1'): seq_loglikelihood += - (C/N) * numpy.sum(numpy.abs(w)) u += eta * (C/N) grad[saved_grad[seq_id][0]] -= saved_grad[seq_id][1] grad += mu_grad w_upd, q_upd = self._apply_l1_penalty(w, q, u, target_indx) w = w_upd q = q_upd # update the computed sequence loglikelihood by adding the regularization term contribution seqs_loglikelihood_vec[seqs_id_mapper[seq_id]] = seq_loglikelihood else: w[target_indx] += eta * (fval - saved_grad[seq_id][1]) w += eta * mu_grad t += 1 # clean cached info crf_model.clear_cached_info([seq_id]) grad.fill(0) w_tilda_c = w seqs_cost_vec = [numpy.mean(seqs_loglikelihood_vec[i:i+step_size]) for i in range(0, N, step_size)] # to consider plotting this vector mean_cost_vec.append(numpy.mean(seqs_loglikelihood_vec)) self._track_sga_optimizer(w, seqs_cost_vec, mean_cost_vec, s) if(self._exitloop): break self._elapsed_time = line = "---Model training--- end time {} \n".format( 
ReaderWriter.log_progress(line, log_file) ReaderWriter.dump_data(mean_cost_vec, os.path.join(model_dir, 'avg_loglikelihood_training')) return(w) def _apply_l1_penalty(self, w, q, u, w_indx): """apply l1 regularization to the weights it uses the approach of Tsuruoka et al. Stochastic gradient descent training for L1-regularized log-linear models with cumulative penalty Args: w: weight vector (numpy vector) q: total L1 penalty that current weights (corresponding to the features) did receive up to the current time u: absolute value of total L1 penalty that each weight could receive up to the current time w_indx: weight indices corresponding to the current features under update TODO: vectorize this function """ for indx in w_indx: z = w[indx] # print("z is {}".format(z)) # print("q[indx] is {}".format(q[indx])) if(w[indx] > 0): # print("we want the max between 0 and {}".format(w[indx] - (u + q[indx]))) w[indx] = numpy.max([0, w[indx] - (u + q[indx])]) elif(w[indx] < 0): # print("we want the min between 0 and {}".format(w[indx] + (u - q[indx]))) w[indx] = numpy.min([0, w[indx] + (u - q[indx])]) # print("z is {}".format(z)) # print("w[indx] is {}".format(w[indx])) q[indx] = q[indx] + (w[indx] - z) return((w, q)) # print("q[indx] becomes {}".format(q[indx])) def _track_sga_optimizer(self, w, seqs_loglikelihood, mean_loglikelihood, k): """track stochastic gradient ascent optimizers by logging each iteration Args: w: weight vector (numpy vector) seqs_loglikelihood: numpy vector representing the average loglikelihood of batches of sequences mean_loglikelihood: mean of the seqs_loglikelihood vector k: current epoch """ delta_time = - self._elapsed_time self._check_reldiff(mean_loglikelihood[-2], mean_loglikelihood[-1]) epoch_num = k # log file model_dir = self.training_description["model_dir"] log_file = os.path.join(model_dir, "crf_training_log.txt") line = "--- Epoch/pass {} --- \n".format(epoch_num+1) line += "Estimated training cost (average loglikelihood) is {} 
\n".format(mean_loglikelihood[-1]) line += "Number of seconds spent: {} \n".format(delta_time.total_seconds()) ReaderWriter.log_progress(line, log_file)
def cleanup(self):
    """Reset the per-run bookkeeping attributes at the end of training.

    Sets ``_iter_count``, ``_elapsed_time`` and ``_exitloop`` to ``None``.
    """
    # iteration counter, time stamp of the previous iteration, early-exit flag
    self._iter_count = self._elapsed_time = self._exitloop = None
class SeqDecodingEvaluator(object):
    """Evaluator class to evaluate performance of the models

    Args:
        model_repr: the CRF model representation that has a suffix of `ModelRepresentation`
                    such as :class:`HOCRFADModelRepresentation`

    Attributes:
        model_repr: the CRF model representation that has a suffix of `ModelRepresentation`
                    such as :class:`HOCRFADModelRepresentation`
        model_confusion_matrix: ``(M+1)x(M+1)`` `numpy` array of counts indexed as
                                ``[reference code, predicted code]``; it is (re)created on
                                every call to :func:`compute_states_confmatrix`

    .. note::

       this class does not support evaluation of segment learning
       (i.e. notations that include IOB2/BIO notation)
    """

    def __init__(self, model_repr):
        self.model_repr = model_repr

    def compute_states_confmatrix(self, Y_seqs_dict):
        """compute/generate the confusion matrix for each state

        Args:
            Y_seqs_dict: dictionary where each sequence has the reference label sequence
                         and its corresponding predicted sequence. It has the following form
                         ``{seq_id:{'Y_ref':[reference_ylabels], 'Y_pred':[predicted_ylabels]}}``

        Returns:
            `numpy` array with (M+1)x2x2 dimension holding, for every state code,
            the matrix ``[[tp, fn], [fp, tn]]``
        """
        Y_codebook = self.model_repr.Y_codebook
        M = len(Y_codebook)
        # add another state in case unseen states occur in the test data
        self.model_confusion_matrix = numpy.zeros((M + 1, M + 1), dtype="float")
        for seq_labels in Y_seqs_dict.values():
            self._compute_model_confusionmatrix(
                self.map_states_to_num(seq_labels['Y_ref'], Y_codebook, M),
                self.map_states_to_num(seq_labels['Y_pred'], Y_codebook, M))
        return self._generate_statelevel_confusion_matrix()

    def _generate_statelevel_confusion_matrix(self):
        """derive one 2x2 matrix ``[[tp, fn], [fp, tn]]`` per state from the model confusion matrix

        The model confusion matrix is indexed ``[reference, predicted]``, hence for state ``i``
        the rest of row ``i`` counts missed occurrences (false negatives) and the rest
        of column ``i`` counts wrong predictions of ``i`` (false positives).
        """
        counts = self.model_confusion_matrix
        tp = counts.diagonal()
        # reference is i but something else was predicted
        fn = counts.sum(axis=1) - tp
        # i was predicted but the reference is something else
        fp = counts.sum(axis=0) - tp
        tn = counts.sum() - (tp + fp + fn)
        statelevel_confmatrix = numpy.zeros((counts.shape[0], 2, 2), dtype='float')
        statelevel_confmatrix[:, 0, 0] = tp
        statelevel_confmatrix[:, 0, 1] = fn
        statelevel_confmatrix[:, 1, 0] = fp
        statelevel_confmatrix[:, 1, 1] = tn
        return statelevel_confmatrix

    @staticmethod
    def _safe_divide(numerator, denominator):
        """divide and raise ``ZeroDivisionError`` on a zero denominator

        `numpy` scalars return nan/inf (with a warning) instead of raising,
        so the check has to be explicit.
        """
        if denominator == 0:
            raise ZeroDivisionError("division by zero")
        return numerator / denominator

    def get_performance_metric(self, taglevel_performance, metric, exclude_states=None):
        """compute the performance of the model using a requested metric

        Args:
            taglevel_performance: `numpy` array with Mx2x2 dimension. For every state code
                                  a 2x2 confusion matrix ``[[tp, fn], [fp, tn]]`` is included.
                                  It is computed using :func:`compute_states_confmatrix`
            metric: evaluation metric that could take one of ``{'f1', 'precision', 'recall', 'accuracy'}``

        Keyword Arguments:
            exclude_states: list (default: no state excluded) of states to exclude from the computation.
                            Usually, in NER applications the non-entity symbol such as 'O' is
                            excluded from the computation. Example: If ``exclude_states = ['O']``,
                            this will replicate the behavior of `conlleval script <>`__

        Returns:
            the micro-averaged value of the metric; 0 when the metric is undefined
            (a zero denominator) or when `metric` is not one of the supported names

        Raises:
            KeyError: if a state in `exclude_states` is not in the model's `Y_codebook`
        """
        Y_codebook = self.model_repr.Y_codebook
        # do not include 'exclude states' in the computation
        excluded_codes = {Y_codebook[state] for state in (exclude_states or ())}
        # total number of states plus 1 (the slot reserved for unseen states)
        M = len(Y_codebook) + 1
        include_indices = [code for code in range(M) if code not in excluded_codes]
        # perform sum across all layers to get micro-average
        collapsed_performance = taglevel_performance[include_indices].sum(axis=0)
        tp = collapsed_performance[0, 0]
        fn = collapsed_performance[0, 1]
        fp = collapsed_performance[1, 0]
        tn = collapsed_performance[1, 1]

        divide = self._safe_divide
        perf_measure = 0
        try:
            if metric == "f1":
                precision = divide(tp, tp + fp)
                recall = divide(tp, tp + fn)
                f1 = divide(2 * precision * recall, precision + recall)
                print("f1 {}".format(f1))
                perf_measure = f1
            elif metric == "precision":
                precision = divide(tp, tp + fp)
                print("precision {}".format(precision))
                perf_measure = precision
            elif metric == "recall":
                recall = divide(tp, tp + fn)
                print("recall {}".format(recall))
                perf_measure = recall
            elif metric == "accuracy":
                accuracy = divide(tp + tn, tp + fp + fn + tn)
                print("accuracy {}".format(accuracy))
                perf_measure = accuracy
        except ZeroDivisionError:
            print("dividing by Zero: check/investigate the confusion matrix")
        # returned after (not inside) the handler so unrelated exceptions still propagate
        return perf_measure

    def map_states_to_num(self, Y, Y_codebook, M):
        """map states to their code/number using the `Y_codebook`

        Args:
            Y: list representing label sequence
            Y_codebook: dictionary containing the states as keys and the assigned unique code as values
            M: number of states

        .. note:: we give one unique index for tags that did not occur in the training data such as len(Y_codebook)
        """
        return [Y_codebook.get(state, M) for state in Y]

    def _compute_model_confusionmatrix(self, Y_ref, Y_pred):
        """compute confusion matrix on the level of the tag/state

        The counts are accumulated in place in ``self.model_confusion_matrix``.

        Args:
            Y_ref: list of reference label sequence (represented by the states code)
            Y_pred: list of predicted label sequence (represented by the states code)
        """
        model_confusion_matrix = self.model_confusion_matrix
        Y_pred = numpy.asarray(Y_pred)
        for position, ref_state in enumerate(numpy.asarray(Y_ref)):
            model_confusion_matrix[ref_state, Y_pred[position]] += 1
class Evaluator(object):
    """Evaluator class to evaluate performance of the models

    Args:
        model_repr: the CRF model representation that has a suffix of `ModelRepresentation`
                    such as :class:`HOCRFADModelRepresentation`

    Attributes:
        model_repr: the CRF model representation that has a suffix of `ModelRepresentation`
                    such as :class:`HOCRFADModelRepresentation`

    .. note::

       this class is **EXPERIMENTAL/work in progress** and does not support evaluation of segment learning.
       Use instead :class:`SeqDecodingEvaluator` for evaluating models learned using **sequence** learning.
    """

    def __init__(self, model_repr):
        self.model_repr = model_repr

    def transform_codebook(self, Y_codebook, prefixes):
        """map states coded in BIO notation to their original states value

        Every state of the codebook gets an entry: a state that starts with one of
        the prefixes is mapped to the remainder of its name, any other state
        (including ``"O"``) is mapped to itself.

        Args:
            Y_codebook: dictionary of states each assigned a unique integer
            prefixes: tuple of prefix notation used such as ("B-","I-") for BIO
        """
        state_mapper = {}
        for state in Y_codebook:
            mapped_state = state
            if state != "O":
                for prefix in prefixes:
                    # only a leading match counts -- "SUB-ORG" does not carry the "B-" prefix
                    if state.startswith(prefix):
                        mapped_state = state[len(prefix):]
                        break
            state_mapper[state] = mapped_state
        return state_mapper

    @staticmethod
    def _safe_divide(numerator, denominator):
        """divide and raise ``ZeroDivisionError`` on a zero denominator

        `numpy` scalars return nan/inf (with a warning) instead of raising,
        so the check has to be explicit.
        """
        if denominator == 0:
            raise ZeroDivisionError("division by zero")
        return numerator / denominator

    def compute_model_performance(self, Y_seqs_dict, metric, output_file, states_notation):
        r"""compute the performance of the model

        Args:
            Y_seqs_dict: dictionary where each sequence has the reference label sequence
                         and its corresponding predicted sequence. It has the following form
                         ``{seq_id:{'Y_ref':[reference_ylabels], 'Y_pred':[predicted_ylabels]}}``
            metric: evaluation metric that could take one of {'f1', 'precision', 'recall', 'accuracy'}
            output_file: file where to output the evaluation result (it is overwritten)
            states_notation: notation used to code the state (i.e. BIO)

        Returns:
            the micro-averaged value of the metric; 0 when the metric is undefined
            (a zero denominator) or when `metric` is not one of the supported names
        """
        Y_codebook = self.model_repr.Y_codebook
        if states_notation == "BIO":
            state_mapper = self.transform_codebook(Y_codebook, ("B-", "I-"))
            # assign consecutive codes to the collapsed states in order of first appearance
            transformed_codebook = {}
            for new_state in state_mapper.values():
                if new_state not in transformed_codebook:
                    transformed_codebook[new_state] = len(transformed_codebook)
        else:
            state_mapper = {state: state for state in Y_codebook}
            transformed_codebook = Y_codebook
        transformed_codebook_rev = {code: state for state, code in transformed_codebook.items()}

        M = len(transformed_codebook)
        # add another state in case unseen states occur in the test data
        model_taglevel_performance = numpy.zeros((M + 1, 2, 2))
        for seq_labels in Y_seqs_dict.values():
            Y_ref = self.map_states_to_num(seq_labels['Y_ref'], state_mapper, transformed_codebook, M)
            Y_pred = self.map_states_to_num(seq_labels['Y_pred'], state_mapper, transformed_codebook, M)
            model_taglevel_performance += self.compute_tags_confusionmatrix(
                Y_ref, Y_pred, transformed_codebook_rev, M)

        # perform sum across all layers to get micro-average
        collapsed_performance = model_taglevel_performance.sum(axis=0)
        tp = collapsed_performance[0, 0]
        fp = collapsed_performance[0, 1]
        fn = collapsed_performance[1, 0]
        tn = collapsed_performance[1, 1]

        divide = self._safe_divide
        perf_measure = 0
        try:
            if metric == "f1":
                precision = divide(tp, tp + fp)
                recall = divide(tp, tp + fn)
                f1 = 2 * divide(precision * recall, precision + recall)
                print("f1 {}".format(f1))
                perf_measure = f1
            elif metric == "precision":
                precision = divide(tp, tp + fp)
                print("precision {}".format(precision))
                perf_measure = precision
            elif metric == "recall":
                recall = divide(tp, tp + fn)
                print("recall {}".format(recall))
                perf_measure = recall
            elif metric == "accuracy":
                accuracy = divide(tp + tn, tp + fp + fn + tn)
                print("accuracy {}".format(accuracy))
                perf_measure = accuracy
        except ZeroDivisionError:
            # same fallback as SeqDecodingEvaluator: report 0 rather than nan
            print("dividing by Zero: check/investigate the confusion matrix")

        with open(output_file, mode='w') as f:
            f.write("The performance of the model based on the {} measure is {}\n".format(metric, perf_measure))
            f.write('Confusion matrix: tp:{} fp:{} fn:{} tn:{}\n'.format(tp, fp, fn, tn))
        return perf_measure

    def map_states_to_num(self, Y, state_mapper, transformed_codebook, M):
        """map states to their code/number using the transformed codebook

        Args:
            Y: list representing label sequence
            state_mapper: mapper between the old and new generated states generated from
                          :func:`transform_codebook` method
            transformed_codebook: the transformed codebook of the new identified states
            M: number of states

        .. note:: we give one unique index for tags that did not occur in the training data such as len(Y_codebook)
        """
        # a state missing from the mapper yields None, which is not a codebook key -> M
        return [transformed_codebook.get(state_mapper.get(state), M) for state in Y]

    def compute_tags_confusionmatrix(self, Y_ref, Y_pred, transformed_codebook_rev, M):
        """compute confusion matrix on the level of the tag/state

        Args:
            Y_ref: list of reference label sequence (represented by the states code)
            Y_pred: list of predicted label sequence (represented by the states code)
            transformed_codebook_rev: mapping from code to state; not used by the computation
            M: number of states

        Returns:
            `numpy` array with (M+1)x2x2 dimension holding ``[[tp, fp], [fn, tn]]``
            for every state code found in `Y_ref` or `Y_pred` (zeros for the others)
        """
        # include codes that were only predicted, otherwise their false positives are lost
        detected_statescode = set(Y_ref) | set(Y_pred)
        Y_ref = numpy.asarray(Y_ref)
        Y_pred = numpy.asarray(Y_pred)
        tagslevel_performance = numpy.zeros((M + 1, 2, 2))
        for statecode in detected_statescode:
            # positions of the target tag in the gold-standard and in the prediction
            tag_indx_origin = numpy.where(Y_ref == statecode)[0]
            tag_indx_pred = numpy.where(Y_pred == statecode)[0]
            tag_tp = numpy.count_nonzero(numpy.isin(tag_indx_origin, tag_indx_pred))
            tag_fn = len(tag_indx_origin) - tag_tp
            # positions where the gold-standard holds any other tag
            other_indx_origin = numpy.where(Y_ref != statecode)[0]
            tag_fp = numpy.count_nonzero(numpy.isin(other_indx_origin, tag_indx_pred))
            tag_tn = len(other_indx_origin) - tag_fp
            tagslevel_performance[statecode] = numpy.array([[tag_tp, tag_fp],
                                                            [tag_fn, tag_tn]])
        return tagslevel_performance
# Script entry point: the guarded body is `pass`, so running this module directly does nothing.
if __name__ == "__main__": pass