Source code for pyseqlab.linear_chain_crf

'''
@author: ahmed allam <ahmed.allam@yale.edu>

'''

import os
from copy import deepcopy
from collections import OrderedDict
import numpy
from .utilities import ReaderWriter, create_directory, vectorized_logsumexp

class LCRFModelRepresentation(object):
    """Model representation that will hold data structures to be used in :class:`LCRF` class

       Attributes:
           modelfeatures: set of features defining the model
           modelfeatures_codebook: dictionary mapping each feature in :attr:`modelfeatures` to a unique code
           Y_codebook: dictionary mapping each state (i.e. tag) to a unique code
           L: length of the longest segment
           Z_codebook: dictionary for the set Z, mapping each element to a unique number/code
           Z_len: dictionary comprising the length of each element in :attr:`Z_codebook`
           Z_elems: dictionary comprising the composing elements of each member in the Z set (:attr:`Z_codebook`)
           Z_numchar: dictionary comprising the number of characters of each member in the Z set (:attr:`Z_codebook`)
           patts_len: set of lengths extracted from :attr:`Z_len` (i.e. set(Z_len.values()))
           max_patt_len: maximum pattern length used in the model
           modelfeatures_inverted: inverted model features (i.e. inverting the :attr:`modelfeatures` dictionary)
           ypatt_features: state features (i.e. y pattern features) that depend only on the states
           ypatt_activestates: possible/potential activated y patterns/features using the observation features
           num_features: total number of features in the model
           num_states: total number of states in the model
    """
    def __init__(self):
        self.modelfeatures = None
        self.modelfeatures_codebook = None
        self.Y_codebook = None
        self.L = None
        self.Z_codebook = None
        self.Z_len = None
        self.Z_elems = None
        self.Z_numchar = None
        self.patts_len = None
        self.max_patt_len = None
        self.modelfeatures_inverted = None
        self.ypatt_features = None
        self.ypatt_activestates = None
        self.num_features = None
        self.num_states = None
    def setup_model(self, modelfeatures, states, L):
        """setup and create the model representation

           Creates all maps and codebooks needed by the :class:`LCRF` class

           Args:
               modelfeatures: set of features defining the model
               states: set of states (i.e. tags)
               L: length of the longest segment
        """
        self.modelfeatures = modelfeatures
        self.modelfeatures_codebook = self.get_modelfeatures_codebook()
        self.Y_codebook = self.get_modelstates_codebook(states)
        self.L = L
        self.generate_instance_properties()
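    # Illustrative usage sketch (the feature dictionary and states below are
    # hypothetical, made up for illustration -- not part of the library):
    #
    #   model_repr = LCRFModelRepresentation()
    #   modelfeatures = {'B-PP': {'w[0]=at': 1, 'w[0]=by': 1},
    #                    'B-PP|B-NP': {'w[0]=the': 2}}
    #   states = {'B-PP', 'B-NP'}
    #   model_repr.setup_model(modelfeatures, states, L=1)
    #   # model_repr.num_features -> 3, model_repr.num_states -> 2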
    def generate_instance_properties(self):
        """generate instance properties that will be later used by the :class:`LCRF` class
        """
        self.Z_codebook = self.get_Z_pattern()
        self.Z_len, self.Z_elems, self.Z_numchar = self.get_Z_info()
        self.patts_len = set(self.Z_len.values())
        self.max_patt_len = max(self.patts_len)
        self.modelfeatures_inverted, self.ypatt_features = self.get_inverted_modelfeatures()
        self.ypatt_activestates = self.find_activated_states(self.ypatt_features, self.patts_len)
        self.num_features = self.get_num_features()
        self.num_states = self.get_num_states()
    def get_modelfeatures_codebook(self):
        r"""setup the model features codebook

           It flattens :attr:`modelfeatures` and maps each element to a unique code.
           :attr:`modelfeatures` are represented in a dictionary with this form::

               {y_patt_1:{featureA:value, featureB:value, ...}
                y_patt_2:{featureA:value, featureC:value, ...}}

           Example::

               modelfeatures:
                   {'B-PP': Counter({'w[0]=at': 1,
                                     'w[0]=by': 1,
                                     'w[0]=for': 4,
                                     ...
                                    }),
                    'B-PP|B-NP': Counter({'w[0]=16': 1,
                                          'w[0]=July': 1,
                                          'w[0]=Nomura': 1,
                                          ...
                                         }),
                    ...
                   }
               modelfeatures_codebook:
                   {('B-PP','w[0]=at'): 0,
                    ('B-PP','w[0]=by'): 1,
                    ('B-PP','w[0]=for'): 2,
                    ...
                   }
        """
        modelfeatures = self.modelfeatures
        codebook = {}
        code = 0
        for y_patt, featuresum in modelfeatures.items():
            for feature in featuresum:
                codebook[(y_patt, feature)] = code
                code += 1
        return(codebook)
    def get_modelstates_codebook(self, states):
        """create the states codebook by mapping each state to a unique code/number

           Args:
               states: set of tags identified in the training sequences

           Example::

               states = {'B-PP', 'B-NP', ...}
               states_codebook = {'B-PP':0, 'B-NP':1, ...}
        """
        return({s:i for (i, s) in enumerate(states)})
    def get_Z_pattern(self):
        """create a codebook from the set Z by mapping each element to a unique number/code

           Z is the set of y patterns used in the model features

           Example::

               Z = {'O|B-VP|B-NP', 'O|B-VP', 'O', 'B-VP', 'B-NP', ...}
               Z_codebook = {'O|B-VP|B-NP':0, 'O|B-VP':1, 'O':2, 'B-VP':3, 'B-NP':4, ...}
        """
        modelfeatures = self.modelfeatures
        Z_codebook = {y_patt:index for index, y_patt in enumerate(modelfeatures)}
        return(Z_codebook)
    def get_Z_info(self):
        """get the properties of the Z set (pattern length, composing elements, number of characters)
        """
        Z_codebook = self.Z_codebook
        Z_len = {}
        Z_elems = {}
        Z_numchar = {}
        for z in Z_codebook:
            elems = z.split("|")
            Z_len[z] = len(elems)
            Z_elems[z] = elems
            Z_numchar[z] = len(z)
        return(Z_len, Z_elems, Z_numchar)
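    # For example, get_Z_info() maps the pattern z = 'O|B-VP' as follows:
    #   Z_len['O|B-VP']     = 2              # two composing states
    #   Z_elems['O|B-VP']   = ['O', 'B-VP']  # obtained by splitting on '|'
    #   Z_numchar['O|B-VP'] = 6              # number of characters in the string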
    def get_inverted_modelfeatures(self):
        r"""invert the :attr:`modelfeatures` instance variable

           Example::

               modelfeatures_inverted =
                   {'w[0]=take': {1: {'I-VP'},
                                  2: {'I-VP|I-VP'},
                                  3: {'I-VP|I-VP|I-VP'}},
                    'w[0]=the': {1: {'B-NP'},
                                 2: {'B-PP|B-NP', 'I-VP|B-NP'},
                                 3: {'B-NP|B-PP|B-NP', 'B-VP|I-VP|B-NP', ...}},
                    ...
                   }
               ypatt_features = {'B-NP', 'B-PP|B-NP', ...}
        """
        modelfeatures = self.modelfeatures
        Z_len = self.Z_len
        inverted_features = {}
        ypatt_features = set()
        for y_patt, featuredict in modelfeatures.items():
            z_len = Z_len[y_patt]
            # get features that are based only on y patterns
            if(y_patt in featuredict):
                ypatt_features.add(y_patt)
            for feature in featuredict:
                if(feature in inverted_features):
                    if(z_len in inverted_features[feature]):
                        inverted_features[feature][z_len].add(y_patt)
                    else:
                        inverted_features[feature][z_len] = {y_patt}
                else:
                    inverted_features[feature] = {z_len: {y_patt}}
        return(inverted_features, ypatt_features)
    def keep_longest_elems(self, s):
        """keep only the longest element for every key; used to determine the
           longest suffix and prefix relations between patterns
        """
        longest_elems = {}
        for tup, l in s.items():
            longest_elems[tup] = max(l, key=len)
        return(longest_elems)
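    # Sketch (hypothetical input): given s = {('I-VP',): ['I', 'I-VP']},
    # keep_longest_elems keeps the longest candidate per key, yielding
    #   {('I-VP',): 'I-VP'}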
    def check_suffix(self, token, ref_str):
        # check if ref_str ends with the token
        return(ref_str.endswith(token))
    def check_prefix(self, token, ref_str):
        # check if ref_str starts with the token
        return(ref_str.startswith(token))
    def get_num_features(self):
        """return the total number of features in the model
        """
        return(len(self.modelfeatures_codebook))
    def get_num_states(self):
        """return the total number of states identified by the model in the training set
        """
        return(len(self.Y_codebook))
    def represent_globalfeatures(self, seq_featuresum):
        """represent features extracted from sequences using :attr:`modelfeatures_codebook`

           Args:
               seq_featuresum: dictionary of sequence global features representing F(X,Y)
        """
        modelfeatures_codebook = self.modelfeatures_codebook
        windx_fval = {}
        for y_patt, seg_features in seq_featuresum.items():
            for featurename in seg_features:
                fkey = (y_patt, featurename)
                if(fkey in modelfeatures_codebook):
                    windx_fval[modelfeatures_codebook[fkey]] = seg_features[featurename]
        count = len(windx_fval)
        return(numpy.fromiter(windx_fval.keys(), numpy.uint32, count),
               numpy.fromiter(windx_fval.values(), numpy.float64, count))
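    # Sketch of the sparse representation returned above (hypothetical codes):
    #   windx_fval = {3: 2.0, 7: 1.0}   # feature code -> feature value
    # yields the pair of arrays
    #   (array([3, 7], dtype=uint32), array([2., 1.]))
    # so that numpy.dot(w[windx], fval) gives the weighted global feature sum
    # used when computing the sequence log-likelihood.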
    def represent_activefeatures(self, activefeatures):
        """represent active features as (feature code array, feature value array) tuples per y pattern
        """
        windx_fval = {}
        for z_patt in activefeatures:
            count = len(activefeatures[z_patt])
            windx_fval[z_patt] = (numpy.fromiter(activefeatures[z_patt].keys(), numpy.uint32, count),
                                  numpy.fromiter(activefeatures[z_patt].values(), numpy.float64, count))
        return(windx_fval)
    def accumulate_activefeatures(self, activefeatures, accumfeatures):
        """merge the detected active features into the accumulator dictionary
        """
        for z_patt in activefeatures:
            if(z_patt in accumfeatures):
                accumfeatures[z_patt].update(activefeatures[z_patt])
            else:
                accumfeatures[z_patt] = activefeatures[z_patt]
    def join_segfeatures_filteredstates(self, seg_features, filtered_states):
        """represent detected active features while parsing sequences

           Args:
               seg_features: dictionary of the observation features. It has the form
                             {featureA_name:value, featureB_name:value, ...}
               filtered_states: dictionary of the filtered y patterns by length. It has the form
                                {patt_len:{patt_1, patt_2, ...}}
        """
        modelfeatures_codebook = self.modelfeatures_codebook
        activefeatures = {}
        for z_len in filtered_states:
            z_patt_set = filtered_states[z_len]
            for z_patt in z_patt_set:
                windx_fval = {}
                for seg_featurename in seg_features:
                    fkey = (z_patt, seg_featurename)
                    if(fkey in modelfeatures_codebook):
                        windx_fval[modelfeatures_codebook[fkey]] = seg_features[seg_featurename]
                if(windx_fval):
                    activefeatures[z_patt] = windx_fval
        return(activefeatures)
    def represent_ypatt_filteredstates(self, filtered_states):
        """represent detected active y pattern (i.e. state) features while parsing sequences

           Args:
               filtered_states: dictionary of the filtered y patterns by length. It has the form
                                {patt_len:{patt_1, patt_2, ...}}
        """
        modelfeatures = self.modelfeatures
        modelfeatures_codebook = self.modelfeatures_codebook
        activefeatures = {}
        for z_len in filtered_states:
            z_patt_set = filtered_states[z_len]
            for z_patt in z_patt_set:
                windx_fval = {}
                if(z_patt in modelfeatures[z_patt]):
                    fkey = (z_patt, z_patt)
                    windx_fval[modelfeatures_codebook[fkey]] = 1
                if(windx_fval):
                    activefeatures[z_patt] = windx_fval
        return(activefeatures)
    def find_seg_activefeatures(self, seg_features, allowed_z_len):
        """find active features based on the observation/segment features

           Args:
               seg_features: dictionary of the observation features of the current segment
               allowed_z_len: set of permissible y pattern lengths/orders
        """
        modelfeatures_codebook = self.modelfeatures_codebook
        modelfeatures_inverted = self.modelfeatures_inverted
        activefeatures = {}
        # use the segment features plus the activated states
        for seg_featurename in seg_features:
            if(seg_featurename in modelfeatures_inverted):
                for z_len in allowed_z_len:
                    if(z_len in modelfeatures_inverted[seg_featurename]):
                        for zpatt in modelfeatures_inverted[seg_featurename][z_len]:
                            fkey = (zpatt, seg_featurename)
                            if(zpatt in activefeatures):
                                activefeatures[zpatt][modelfeatures_codebook[fkey]] = seg_features[seg_featurename]
                            else:
                                activefeatures[zpatt] = {modelfeatures_codebook[fkey]: seg_features[seg_featurename]}
        return(activefeatures)
    def find_ypatt_activefeatures(self, allowed_z_len):
        """find the label and state transition features (if they are modeled)

           Args:
               allowed_z_len: set of permissible y pattern lengths/orders
        """
        modelfeatures_codebook = self.modelfeatures_codebook
        ypatt_activestates = self.ypatt_activestates
        activefeatures = {}
        # check if y pattern features are modeled
        for z_len in allowed_z_len:
            if(z_len in ypatt_activestates):
                for zpatt in ypatt_activestates[z_len]:
                    fkey = (zpatt, zpatt)
                    if(zpatt in activefeatures):
                        activefeatures[zpatt][modelfeatures_codebook[fkey]] = 1
                    else:
                        activefeatures[zpatt] = {modelfeatures_codebook[fkey]: 1}
        return(activefeatures)
    def find_activated_states(self, seg_features, allowed_z_len):
        """identify possible activated y patterns/features using the observation features

           Args:
               seg_features: dictionary of the observation features. It has the form
                             {featureA_name:value, featureB_name:value, ...}
               allowed_z_len: set of permissible order/length of y features;
                              {1,2,3} -> means up to third order y features are allowed
        """
        modelfeatures_inverted = self.modelfeatures_inverted
        active_states = {}
        for feature in seg_features:
            if(feature in modelfeatures_inverted):
                factivestates = modelfeatures_inverted[feature]
                for z_len in factivestates:
                    if(z_len in allowed_z_len):
                        if(z_len in active_states):
                            active_states[z_len].update(factivestates[z_len])
                        else:
                            active_states[z_len] = set(factivestates[z_len])
        return(active_states)
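    # Sketch (hypothetical inverted features): with
    #   modelfeatures_inverted = {'w[0]=the': {1: {'B-NP'}, 2: {'B-PP|B-NP'}}}
    # the call find_activated_states({'w[0]=the': 1}, {1, 2}) returns
    #   {1: {'B-NP'}, 2: {'B-PP|B-NP'}}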
    def filter_activated_states(self, activated_states, accum_active_states, boundary):
        """filter/prune states and y features

           Args:
               activated_states: dictionary containing possible active states/y features;
                                 it has the form {patt_len:{patt_1, patt_2, ...}}
               accum_active_states: dictionary of the possible active states per position;
                                    it has the form {(pos, pos):{state_1, state_2, ...}}
               boundary: tuple (u,v) representing the current boundary in the sequence
        """
        Z_elems = self.Z_elems
        filtered_activestates = {}
        __, pos = boundary
        for z_len in activated_states:
            if(z_len == 1):
                continue
            start_pos = pos - z_len + 1
            if((start_pos, start_pos) in accum_active_states):
                filtered_activestates[z_len] = set()
                for z_patt in activated_states[z_len]:
                    check = True
                    zelems = Z_elems[z_patt]
                    for i in range(z_len):
                        pos_bound = (start_pos + i, start_pos + i)
                        if(pos_bound not in accum_active_states):
                            check = False
                            break
                        if(zelems[i] not in accum_active_states[pos_bound]):
                            check = False
                            break
                    if(check):
                        filtered_activestates[z_len].add(z_patt)
        return(filtered_activestates)
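    # Worked example: at boundary (3, 3) a pattern of length 2 such as
    # 'B-PP|B-NP' starts at position 2 (start_pos = 3 - 2 + 1); it survives
    # the filtering only if accum_active_states[(2, 2)] contains 'B-PP' and
    # accum_active_states[(3, 3)] contains 'B-NP'.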
    def save(self, folder_dir):
        """save the main model data structures

           Args:
               folder_dir: string representing the directory where the data structures are dumped
        """
        model_info = {'MR_modelfeatures': self.modelfeatures,
                      'MR_modelfeaturescodebook': self.modelfeatures_codebook,
                      'MR_Ycodebook': self.Y_codebook,
                      'MR_L': self.L}
        for name in model_info:
            ReaderWriter.dump_data(model_info[name], os.path.join(folder_dir, name))
class LCRF(object):
    """linear chain CRF model

       Args:
           model: an instance of the :class:`LCRFModelRepresentation` class
           seqs_representer: an instance of the :class:`SeqsRepresenter` class
           seqs_info: dictionary holding sequences info

       Keyword Args:
           load_info_fromdisk: integer from 0 to 5 specifying the number of data entities
                               to load from disk rather than keep cached in memory;
                               0 means keep everything in memory while 5 means
                               load everything from disk

       Attributes:
           model: an instance of the :class:`LCRFModelRepresentation` class
           weights: a numpy vector representing feature weights
           seqs_representer: an instance of the :class:`SeqsRepresenter` class
           seqs_info: dictionary holding sequences info
           beam_size: determines the size of the beam for state pruning
           func_dict: a map from entity name to the function that loads that entity
           def_cached_entities: a list of the names of cached entities sorted (descending)
                                based on the estimated space required in memory
    """
    def __init__(self, model, seqs_representer, seqs_info, load_info_fromdisk=5):
        self.model = model
        self.weights = numpy.zeros(model.num_features, dtype="longdouble")
        self.seqs_representer = seqs_representer
        self.seqs_info = seqs_info
        self.func_dict = {"alpha": self._load_alpha,
                          "beta": self._load_beta,
                          "activated_states": self.load_activatedstates,
                          "seg_features": self.load_segfeatures,
                          "globalfeatures": self.load_globalfeatures,
                          "globalfeatures_per_boundary": self.load_globalfeatures,
                          "activefeatures": self.load_activefeatures,
                          "Y": self._load_Y}
        self.def_cached_entities = self.cached_entitites(load_info_fromdisk)
        # default beam size covers all available states
        self.beam_size = len(self.model.Y_codebook)
    def cached_entitites(self, load_info_fromdisk):
        """construct the list of entities that are cleared from memory after use
           (and hence reloaded from disk when needed again)
        """
        ondisk_info = ["activefeatures", "seg_features", "activated_states",
                       "globalfeatures_per_boundary", "globalfeatures", "Y"]
        def_cached_entities = ondisk_info[:load_info_fromdisk]
        return(def_cached_entities)
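    # For instance, load_info_fromdisk = 2 yields
    #   def_cached_entities = ['activefeatures', 'seg_features']
    # meaning those two entities are cleared from memory after each use (see
    # clear_cached_info()) and reloaded from disk when needed again.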
    @property
    def seqs_info(self):
        return self._seqs_info

    @seqs_info.setter
    def seqs_info(self, info_dict):
        # make a copy of the passed seqs_info dictionary
        self._seqs_info = deepcopy(info_dict)
    def identify_activefeatures(self, seq_id, boundary, accum_activestates, apply_filter=True):
        """determine the model active features for a given sequence at the defined boundary

           Main task:
               - determine the model active features in a given boundary
               - update the accum_activestates dictionary

           Args:
               seq_id: integer representing unique id assigned to the sequence
               boundary: tuple (u,v) defining the boundary under consideration
               accum_activestates: dictionary of the form {(u,v):{state_1, state_2, ...}};
                                   it keeps track of the active states in each boundary
        """
        model = self.model
        max_patt_len = model.max_patt_len
        patts_len = model.patts_len
        ypatt_features = model.ypatt_features
        # default length of a state/tag
        state_len = 1
        seg_features = self.seqs_info[seq_id]['seg_features'][boundary]

        start_state_flag = False
        if('__START__' in model.Y_codebook):
            # a first-order model is used with max_patt_len = 2
            start_state_flag = True
            apply_filter = True

        u, __ = boundary
        if(u == 1 and start_state_flag):
            accum_activestates[(0, 0)] = {'__START__'}

        if(u < max_patt_len):
            # case when we use a first-order CRF model -- max_patt_len = 2
            if(start_state_flag):
                max_len = max_patt_len
            else:
                max_len = u
        else:
            max_len = max_patt_len
        # determine the allowed z pattern lengths (i.e. pattern orders)
        allowed_z_len = {z_len for z_len in patts_len if z_len <= max_len}

        if(not apply_filter):
            # case of no filtering
            seg_activefeatures = model.find_seg_activefeatures(seg_features, allowed_z_len)
            ypatt_activefeatures = model.find_ypatt_activefeatures(allowed_z_len)
            # combine both
            accumfeatures = seg_activefeatures
            model.accumulate_activefeatures(ypatt_activefeatures, accumfeatures)
        else:
            # case of filtering
            seg_activefeatures = model.find_seg_activefeatures(seg_features, {state_len})
            ypatt_activefeatures = model.find_ypatt_activefeatures({state_len})
            # determine the activated states with zero order (i.e. length = 1)
            zero_order_activatedstates = set(seg_activefeatures.keys())
            zero_order_activatedstates.update(set(ypatt_activefeatures.keys()))
            accum_activestates[boundary] = zero_order_activatedstates
            # remove states with zero order (i.e. length = 1)
            allowed_z_len.remove(state_len)

            seg_activated_states = model.find_activated_states(seg_features, allowed_z_len)
            seg_filtered_states = model.filter_activated_states(seg_activated_states, accum_activestates, boundary)
            seg_activefeatures_addendum = model.join_segfeatures_filteredstates(seg_features, seg_filtered_states)

            ypatt_activated_states = model.find_activated_states(ypatt_features, allowed_z_len)
            ypatt_filtered_states = model.filter_activated_states(ypatt_activated_states, accum_activestates, boundary)
            ypatt_activefeatures_addendum = model.represent_ypatt_filteredstates(ypatt_filtered_states)

            # join all the active features
            accumfeatures = seg_activefeatures
            model.accumulate_activefeatures(ypatt_activefeatures, accumfeatures)
            model.accumulate_activefeatures(seg_activefeatures_addendum, accumfeatures)
            model.accumulate_activefeatures(ypatt_activefeatures_addendum, accumfeatures)

        activefeatures = model.represent_activefeatures(accumfeatures)
        return(activefeatures)
    def generate_activefeatures(self, seq_id):
        """construct a dictionary of the model active features identified for a given sequence

           Main task:
               - generate active features for every boundary of the sequence

           Args:
               seq_id: integer representing unique id assigned to the sequence
        """
        # to be used when using gradient-based methods for learning
        T = self.seqs_info[seq_id]["T"]
        L = self.model.L
        accum_activestates = {}
        activefeatures_perboundary = {}
        ypatt_activestates = self.model.ypatt_activestates
        # a zero-order state/tag has state_len = 1 (i.e. using only one state)
        state_len = 1
        apply_filter = True
        # check if we are modeling label bias terms or having categorical features
        if(state_len in ypatt_activestates or self.seqs_representer.attr_scaler):
            apply_filter = False
        for j in range(1, T + 1):
            for d in range(L):
                u = j - d
                if(u <= 0):
                    break
                v = j
                boundary = (u, v)
                # identify active features
                active_features = self.identify_activefeatures(seq_id, boundary, accum_activestates,
                                                               apply_filter=apply_filter)
                activefeatures_perboundary[boundary] = active_features
        return(activefeatures_perboundary)
    def compute_forward_vec(self, w, seq_id):
        """compute the forward matrix (alpha matrix)

           Args:
               w: weight vector (numpy vector)
               seq_id: integer representing unique id assigned to the sequence

           .. warning::
              the implementation of this method is in the child class
        """
        pass
    def compute_backward_vec(self, w, seq_id):
        """compute the backward matrix (beta matrix)

           Args:
               w: weight vector (numpy vector)
               seq_id: integer representing unique id assigned to the sequence

           .. warning::
              the implementation of this method is in the child class
        """
        pass
    def compute_marginals(self, seq_id):
        """compute the marginals (i.e. the probability of each y pattern at each position)

           Args:
               seq_id: integer representing unique id assigned to the sequence

           .. warning::
              the implementation of this method is in the child class
        """
        pass
    def compute_feature_expectation(self, seq_id, P_marginals, grad):
        """compute the feature expectations (i.e. the expected count of each feature under the learned model)

           Args:
               seq_id: integer representing unique id assigned to the sequence
               P_marginals: probability matrix for y patterns at each position in time
               grad: numpy vector in which the computed expectations are accumulated

           .. warning::
              the implementation of this method is in the child class
        """
        pass
    def compute_seq_loglikelihood(self, w, seq_id):
        r"""compute the conditional log-likelihood of a sequence (i.e. :math:`\log p(Y|X;w)`)

           it is used as a cost function for the single sequence when trying to estimate the parameters w

           Args:
               w: weight vector (numpy vector)
               seq_id: integer representing unique id assigned to the sequence
        """
        # we need the global features and the alpha matrix to be ready -- the order is important
        l = OrderedDict()
        l['globalfeatures'] = (seq_id, False)
        l['activefeatures'] = (seq_id, )
        l['alpha'] = (w, seq_id)
        self.check_cached_info(seq_id, l)
        # Z is the log of the partition function (normalization constant) of the sequence under parameters w
        Z = self.seqs_info[seq_id]["Z"]
        w_indx, f_val = self.seqs_info[seq_id]["globalfeatures"]
        # log(p(Y|X;w))
        loglikelihood = numpy.dot(w[w_indx], f_val) - Z
        self.seqs_info[seq_id]["loglikelihood"] = loglikelihood
        return(loglikelihood)
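    # In equation form, compute_seq_loglikelihood() evaluates
    #   log p(Y|X;w) = w . F(X,Y) - log Z(X;w)
    # where F(X,Y) is the aggregated global feature vector of the reference
    # labeling (the (w_indx, f_val) pair above) and log Z is computed in
    # _load_alpha() as the logsumexp over the last row of the alpha matrix.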
    def compute_seq_gradient(self, w, seq_id, grad):
        r"""compute the gradient of the conditional log-likelihood with respect to the parameters vector w
           (:math:`\frac{\partial \log p(Y|X;w)}{\partial w}`)

           Args:
               w: weight vector (numpy vector)
               seq_id: integer representing unique id assigned to the sequence
               grad: numpy vector in which the gradient of the sequence is accumulated
        """
        # we need the alpha, beta, global features and active features to be ready
        l = OrderedDict()
        l['globalfeatures'] = (seq_id, False)
        l['activefeatures'] = (seq_id, )
        l['alpha'] = (w, seq_id)
        l['beta'] = (w, seq_id)
        self.check_cached_info(seq_id, l)
        # compute the marginal probability of y patterns at every position
        P_marginal = self.compute_marginals(seq_id)
        # compute the feature expectations
        self.compute_feature_expectation(seq_id, P_marginal, grad)
        target_indx = numpy.where(grad != 0)[0]
        # get the global feature counts of the reference sequence
        gwindx, gfval = self.seqs_info[seq_id]["globalfeatures"]
        grad[target_indx] *= -1
        grad[gwindx] += gfval
        # update target_indx
        target_indx = numpy.unique(numpy.concatenate((target_indx, gwindx)))
        return(target_indx)
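    # The gradient assembled above is the standard CRF gradient
    #   d log p(Y|X;w) / dw = F(X,Y) - E_{p(Y'|X;w)}[F(X,Y')]
    # i.e. the empirical feature counts of the reference labeling minus the
    # model's expected feature counts; target_indx marks the touched entries
    # so that callers can accumulate sparsely.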
    def compute_seqs_loglikelihood(self, w, seqs_id):
        """compute the conditional log-likelihood of the training sequences

           it is used as a cost/objective function over the whole set of training sequences
           when trying to estimate the parameters w

           Args:
               w: weight vector (numpy vector)
               seqs_id: list of integers representing the unique ids of the sequences used for training
        """
        seqs_loglikelihood = 0
        for seq_id in seqs_id:
            seqs_loglikelihood += self.compute_seq_loglikelihood(w, seq_id)
        return(seqs_loglikelihood)
    def compute_seqs_gradient(self, w, seqs_id):
        """compute the gradient of the conditional log-likelihood with respect to the parameters vector w

           Args:
               w: weight vector (numpy vector)
               seqs_id: list of integers representing the unique ids of the sequences used for training
        """
        seqs_grad = numpy.zeros(len(w))
        seq_grad = numpy.zeros(len(w))
        for seq_id in seqs_id:
            target_indx = self.compute_seq_gradient(w, seq_id, seq_grad)
            seqs_grad[target_indx] += seq_grad[target_indx]
            seq_grad.fill(0)
        return(seqs_grad)
    def _load_alpha(self, w, seq_id):
        """compute and load the alpha matrix in :attr:`seqs_info`

           Args:
               w: weight vector (numpy vector)
               seq_id: integer representing unique id assigned to the sequence

           .. note::
              - the seg_features (per boundary) dictionary should be available in :attr:`seqs_info`
              - the activated_states (per boundary) dictionary should be available in :attr:`seqs_info`
        """
        seq_info = self.seqs_info[seq_id]
        seq_info["alpha"] = self.compute_forward_vec(w, seq_id)
        seq_info["Z"] = vectorized_logsumexp(seq_info["alpha"][-1, :])

    def _load_beta(self, w, seq_id):
        """compute and load the beta matrix in :attr:`seqs_info`

           Args:
               w: weight vector (numpy vector)
               seq_id: integer representing unique id assigned to the sequence

           .. note::
              - the fpotential per boundary dictionary should be available in :attr:`seqs_info`
        """
        seq_info = self.seqs_info[seq_id]
        seq_info["beta"] = self.compute_backward_vec(w, seq_id)

    def _load_Y(self, seq_id):
        """load the Y sequence and the boundaries in :attr:`seqs_info`

           Args:
               seq_id: integer representing unique id assigned to the sequence
        """
        seq = self._load_seq(seq_id, target="seq")
        self.seqs_info[seq_id]['Y'] = {'flat_y': seq.flat_y, 'boundaries': seq.y_sboundaries}
    def load_activatedstates(self, seq_id):
        """load the sequence activated states in :attr:`seqs_info`

           Args:
               seq_id: integer representing unique id assigned to the sequence
        """
        seqs_info = self.seqs_info
        seqs_representer = self.seqs_representer
        activated_states = seqs_representer.get_seq_activatedstates(seq_id, seqs_info)
        seqs_info[seq_id]["activated_states"] = activated_states
    def load_segfeatures(self, seq_id):
        """load the sequence observation features in :attr:`seqs_info`

           Args:
               seq_id: integer representing unique id assigned to the sequence
        """
        seqs_info = self.seqs_info
        seqs_representer = self.seqs_representer
        seg_features = seqs_representer.get_seq_segfeatures(seq_id, seqs_info)
        self.seqs_info[seq_id]["seg_features"] = seg_features
    def load_activefeatures(self, seq_id):
        """load the model-identified active features of the sequence in :attr:`seqs_info`

           Args:
               seq_id: integer representing unique id assigned to the sequence
        """
        seqs_representer = self.seqs_representer
        activefeatures = seqs_representer.get_seq_activefeatures(seq_id, self.seqs_info)
        if(not activefeatures):
            # check that seg_features are loaded
            l = {}
            l['seg_features'] = (seq_id, )
            self.check_cached_info(seq_id, l)
            activefeatures = self.generate_activefeatures(seq_id)
            seq_dir = self.seqs_info[seq_id]['activefeatures_dir']
            ReaderWriter.dump_data(activefeatures, os.path.join(seq_dir, 'activefeatures'))
        self.seqs_info[seq_id]["activefeatures"] = activefeatures
    def load_globalfeatures(self, seq_id, per_boundary=True):
        """load the sequence global features in :attr:`seqs_info`

           Args:
               seq_id: integer representing unique id assigned to the sequence

           Keyword Args:
               per_boundary: boolean indicating whether the required global features dictionary
                             is represented by boundary (i.e. True) or aggregated (i.e. False)
        """
        seqs_representer = self.seqs_representer
        gfeatures, exception_fired = seqs_representer.get_seq_globalfeatures(seq_id, self.seqs_info,
                                                                             per_boundary=per_boundary)
        if(per_boundary):
            fname = "globalfeatures_per_boundary"
        else:
            fname = "globalfeatures"
        if(exception_fired):
            gfeatures = self.model.represent_globalfeatures(gfeatures)
            seq_dir = self.seqs_info[seq_id]['globalfeatures_dir']
            ReaderWriter.dump_data(gfeatures, os.path.join(seq_dir, 'globalfeatures_repr'))
        self.seqs_info[seq_id][fname] = gfeatures
    def load_imposter_globalfeatures(self, seq_id, y_imposter, seg_other_symbol):
        """load the imposter sequence global features in :attr:`seqs_info`

           Args:
               seq_id: integer representing unique id assigned to the sequence
               y_imposter: the imposter sequence generated using the viterbi decoder
               seg_other_symbol: if specified, the task is a segmentation problem
                                 (in this case we need to specify the non-entity/other element);
                                 else if it is None (default), the task is considered a sequence labeling problem
        """
        seqs_representer = self.seqs_representer
        imposter_gfeatures_perboundary, y_imposter_boundaries = \
            seqs_representer.get_imposterseq_globalfeatures(seq_id, self.seqs_info, y_imposter, seg_other_symbol)
        return(imposter_gfeatures_perboundary, y_imposter_boundaries)
    def represent_globalfeature(self, gfeatures, boundaries):
        """represent the extracted sequence global features

           two representations can be applied:
               - (1) features identified by boundary (i.e. f(X,Y))
               - (2) features identified and aggregated across all positions in the sequence (i.e. F(X,Y))

           Args:
               gfeatures: dictionary representing the extracted sequence features (i.e. F(X,Y))
               boundaries: if specified (i.e. a list of boundaries), the required representation is
                           global features per boundary (option (1));
                           else (i.e. None or an empty list), the required representation is
                           the aggregated global features (option (2))
        """
        seqs_representer = self.seqs_representer
        windx_fval = seqs_representer.represent_gfeatures(gfeatures, self.model, boundaries=boundaries)
        return(windx_fval)
    def _load_seq(self, seq_id, target="seq"):
        """load/return components of the sequence, which is an instance of :class:`SequenceStruct`

           Args:
               seq_id: integer representing unique id assigned to the sequence

           Keyword Args:
               target: string from {'seq', 'Y', 'X'}
        """
        seqs_representer = self.seqs_representer
        seq = seqs_representer.load_seq(seq_id, self.seqs_info)
        if(target == "seq"):
            return(seq)
        elif(target == "Y"):
            return(seq.Y)
        elif(target == "X"):
            return(seq.X)
    def check_cached_info(self, seq_id, entity_names):
        """check and load the required data elements/entities for every computation step

           Args:
               seq_id: integer representing unique id assigned to the sequence
               entity_names: (ordered) dictionary mapping the name of each data element that needs
                             to be loaded in the :attr:`seqs_info` dictionary to the tuple of
                             arguments passed to its loading function

           .. note::
              the order of elements in the entity_names dictionary is **important**
        """
        seq_info = self.seqs_info[seq_id]
        func_dict = self.func_dict
        for varname, args in entity_names.items():
            if(seq_info.get(varname) is None):
                func_dict[varname](*args)
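    # Typical usage (mirrors compute_seq_loglikelihood()); the order matters,
    # e.g. computing alpha requires the active features to be loaded first:
    #
    #   l = OrderedDict()
    #   l['activefeatures'] = (seq_id, )
    #   l['alpha'] = (w, seq_id)
    #   self.check_cached_info(seq_id, l)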
    def clear_cached_info(self, seqs_id, cached_entities=[]):
        """clear/clean the loaded data elements/entities in the :attr:`seqs_info` dictionary

           Args:
               seqs_id: list of integers representing the unique ids of the training sequences

           Keyword Args:
               cached_entities: list of data entities to be cleared from the :attr:`seqs_info` dictionary
        """
        args = self.def_cached_entities + cached_entities
        for seq_id in seqs_id:
            seq_info = self.seqs_info[seq_id]
            for varname in args:
                if(varname in seq_info):
                    seq_info[varname] = None
    def save_model(self, folder_dir):
        """save the model data structures

           Args:
               folder_dir: string representing the directory where the files are pickled/dumped
        """
        # clean things before pickling the model
        self.seqs_info.clear()
        self.seqs_representer.save(folder_dir)
        self.model.save(folder_dir)
        # save the weights
        ReaderWriter.dump_data(self.weights, os.path.join(folder_dir, "weights"))
        # write the names of the classes used into a file
        components = [self.model,
                      self,
                      self.seqs_representer,
                      self.seqs_representer.feature_extractor,
                      self.seqs_representer.attr_extractor]
        class_desc = [type(component).__name__ for component in components]
        if(self.seqs_representer.attr_scaler):
            class_desc.append(type(self.seqs_representer.attr_scaler).__name__)
        else:
            class_desc.append('None')
        with open(os.path.join(folder_dir, 'class_desc.txt'), 'a') as f:
            f.write("\n".join(class_desc))
    def decode_seqs(self, decoding_method, out_dir, **kwargs):
        r"""decode sequences (i.e. infer the labels of a sequence of observations)

           Args:
               decoding_method: a string referring to the type of decoding
                                {'viterbi', 'perstate_decoding'}
               out_dir: string representing the working directory (path) where the sequence processing will take place

           Keyword Arguments:
               file_name: the name of the file in case the decoded sequences are required to be written
               sep: separator (default '\t') between the columns when writing decoded sequences to file
               procseqs_foldername: string representing the folder name where intermediary data and parsing would take place
               beam_size: integer determining the size of the beam while decoding
               seqs: a list of sequences that are instances of the :class:`SequenceStruct` class to be decoded
                     (used for decoding test data or any new/unseen data -- sequences)
               seqs_info: dictionary containing the info about the sequences to decode
                          (used for decoding training sequences)
               seqs_dict: a dictionary comprising sequence ids as keys and the corresponding sequences
                          (instances of the :class:`SequenceStruct` class) to be decoded as values

           .. note::
              among the keyword arguments, only one of {``seqs``, ``seqs_info``, ``seqs_dict``} needs to be specified
        """
        w = self.weights
        if(decoding_method == "perstate_decoding"):
            decoder = self.perstate_posterior_decoding
        else:
            decoder = self.viterbi

        file_name = kwargs.get('file_name')
        if(file_name):
            # file to write the sequences with their predicted labels
            corpus_fname = "decoding_seqs"
            out_file = os.path.join(create_directory(corpus_fname, out_dir), file_name)
        if(kwargs.get("sep")):
            sep = kwargs['sep']
        else:
            # default separator is tab
            sep = "\t"

        beam_size = kwargs.get('beam_size')
        if(not beam_size):
            beam_size = self.beam_size

        unique_id = False
        procseqs_foldername = kwargs.get('procseqs_foldername')
        if(not procseqs_foldername):
            unique_id = True
            procseqs_foldername = "processed_seqs"

        if(kwargs.get("seqs_info")):
            self.seqs_info = kwargs["seqs_info"]
            N = len(self.seqs_info)
        else:
            if(kwargs.get("seqs")):
                seqs = kwargs["seqs"]
                seqs_dict = {i + 1: seqs[i] for i in range(len(seqs))}
            elif(kwargs.get("seqs_dict")):
                seqs_dict = kwargs['seqs_dict']
            else:
                raise ValueError('You need to specify one of the following keyword arguments: seqs, seqs_info or seqs_dict')
            seqs_id = list(seqs_dict.keys())
            N = len(seqs_id)
            seqs_info = self.seqs_representer.prepare_seqs(seqs_dict, procseqs_foldername, out_dir, unique_id=unique_id)
            self.seqs_representer.scale_attributes(seqs_id, seqs_info)
            self.seqs_representer.extract_seqs_modelactivefeatures(seqs_id, seqs_info, self.model,
                                                                   procseqs_foldername, learning=False)
            self.seqs_info = seqs_info

        seqs_pred = {}
        seqs_info = self.seqs_info
        counter = 0
        for seq_id in seqs_info:
            Y_pred, __ = decoder(w, seq_id, beam_size)
            seq = ReaderWriter.read_data(os.path.join(seqs_info[seq_id]["globalfeatures_dir"], "sequence"))
            if(file_name):
                self.write_decoded_seqs([seq], [Y_pred], out_file, sep)
            seqs_pred[seq_id] = {'seq': seq, 'Y_pred': Y_pred}
            # clear the added info per sequence
            self.clear_cached_info([seq_id])
            counter += 1
            print("sequence decoded -- {} sequences are left".format(N - counter))

        # clear seqs_info
        self.seqs_info.clear()
        return(seqs_pred)
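    # Hedged usage sketch (hypothetical working directory and sequences;
    # seq_1/seq_2 stand for SequenceStruct instances built elsewhere):
    #
    #   seqs_pred = crf.decode_seqs("viterbi", "working_dir",
    #                               seqs=[seq_1, seq_2],
    #                               file_name="decoded.txt", sep="\t")
    #   # seqs_pred maps seq_id -> {'seq': SequenceStruct, 'Y_pred': [...]}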
    def write_decoded_seqs(self, ref_seqs, Y_pred_seqs, out_file, sep="\t"):
        """write the inferred sequences to a file

           Args:
               ref_seqs: list of sequences that are instances of :class:`SequenceStruct`
               Y_pred_seqs: list of lists of tags decoded for every reference sequence
               out_file: string representing the out file where the data is written
               sep: separator used while writing to the out file
        """
        for i in range(len(ref_seqs)):
            Y_pred_seq = Y_pred_seqs[i]
            ref_seq = ref_seqs[i]
            T = ref_seq.T
            line = ""
            for t in range(1, T + 1):
                for field_name in ref_seq.X[t]:
                    line += ref_seq.X[t][field_name] + sep
                if(ref_seq.flat_y):
                    line += ref_seq.flat_y[t - 1] + sep
                line += Y_pred_seq[t - 1]
                line += "\n"
            line += "\n"
            ReaderWriter.log_progress(line, out_file)
    def prune_states(self, j, delta, beam_size):
        """prune the states that fall off the specified beam size

           Args:
               j: current position (integer) in the sequence
               delta: score matrix
               beam_size: specified size of the beam (integer)

           .. warning::
              the implementation of this method is in the child class
        """
        pass
    def viterbi(self, w, seq_id, beam_size, stop_off_beam=False, y_ref=[], K=1):
        """decode sequences using the viterbi decoder

           Args:
               w: weight vector (numpy vector)
               seq_id: integer representing unique id assigned to the sequence
               beam_size: integer representing the size of the beam

           Keyword Arguments:
               stop_off_beam: boolean indicating whether to stop when the reference state
                              falls off the beam (used in perceptron/search based learning)
               y_ref: reference sequence list of labels (used while learning)
               K: integer indicating the number of decoded sequences required (i.e. top-K list)

           .. warning::
              the implementation of this method is in the child class
        """
        pass
    def validate_forward_backward_pass(self, w, seq_id):
        """check the validity of the forward/backward pass

           Args:
               w: weight vector (numpy vector)
               seq_id: integer representing unique id assigned to the sequence
        """
        self.clear_cached_info([seq_id])
        # this will compute the alpha and beta matrices and save them in the seqs_info dict
        l = OrderedDict()
        l['activefeatures'] = (seq_id, )
        l['alpha'] = (w, seq_id)
        l['beta'] = (w, seq_id)
        self.check_cached_info(seq_id, l)

        alpha = self.seqs_info[seq_id]["alpha"]
        beta = self.seqs_info[seq_id]["beta"]
        Z_alpha = vectorized_logsumexp(alpha[-1, :])
        Z_beta = numpy.min(beta[1, :])
        raw_diff = numpy.abs(Z_alpha - Z_beta)
        print("alpha[-1,:] = {}".format(alpha[-1, :]))
        print("beta[1,:] = {}".format(beta[1, :]))
        print("Z_alpha : {}".format(Z_alpha))
        print("Z_beta : {}".format(Z_beta))
        print("Z_alpha - Z_beta {}".format(raw_diff))
        rel_diff = raw_diff / (Z_alpha + Z_beta)
        print("rel_diff : {}".format(rel_diff))
        self.clear_cached_info([seq_id])
        return((raw_diff, rel_diff))
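    # A correct forward/backward implementation satisfies
    #   log Z = logsumexp(alpha[T, :]) = (backward quantity read from beta at position 1)
    # so raw_diff should be near zero, up to floating-point error.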
    def check_gradient(self, w, seq_id):
        """implementation of the finite difference method, similar to ``scipy.optimize.check_grad()``

           Args:
               w: weight vector (numpy vector); only its dimension is used, since a random
                  weight vector is generated for the check
               seq_id: integer representing unique id assigned to the sequence
        """
        print("checking gradient...")
        self.clear_cached_info([seq_id])
        epsilon = 1e-4
        w_dim = len(w)
        # generate a random weight vector to perform the check with
        w = numpy.random.randn(w_dim)
        # basis vector
        ei = numpy.zeros(w_dim, dtype="longdouble")
        grad = numpy.zeros(w_dim, dtype="longdouble")
        for i in range(len(w)):
            ei[i] = epsilon
            l_wplus = self.compute_seq_loglikelihood(w + ei, seq_id)
            self.clear_cached_info([seq_id])
            l_wminus = self.compute_seq_loglikelihood(w - ei, seq_id)
            self.clear_cached_info([seq_id])
            grad[i] = (l_wplus - l_wminus) / (2 * epsilon)
            ei[i] = 0
        estimated_grad = self.compute_seqs_gradient(w, [seq_id])
        diff = numpy.abs(-grad + estimated_grad)
        avg_diff = numpy.mean(diff)
        print("difference between both gradients: \n {}".format(diff))
        print("average difference = {}".format(avg_diff))
        # clear seq_id info
        self.clear_cached_info([seq_id])
        return(avg_diff)
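    # The loop above applies the central finite-difference approximation
    #   dL/dw_i  ~  (L(w + eps*e_i) - L(w - eps*e_i)) / (2*eps)
    # per coordinate, with e_i the i-th basis vector and eps = 1e-4, and
    # compares it against the analytically computed gradient.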
    def validate_gradient(self, w, seq_id):
        """validate the gradient using the approach mentioned in the (Bottou, 2012)
           'Stochastic Gradient Descent Tricks' paper

           Args:
               w: weight vector (numpy vector); only its dimension is used, since random
                  weight vectors are generated for the check
               seq_id: integer representing unique id assigned to the sequence
        """
        print("checking gradient using the approach mentioned in (Bottou, 2012) 'Stochastic Gradient Descent Tricks' paper...")
        self.clear_cached_info([seq_id])
        epsilons = [1e-6, 1e-8, 1e-10]
        rounds = 5
        res = {}
        for __ in range(rounds):
            for epsilon in epsilons:
                # generate a random initial weight vector w0
                w0 = numpy.random.rand(len(w))
                l0 = self.compute_seq_loglikelihood(w0, seq_id)
                self.clear_cached_info([seq_id])
                g = self.compute_seqs_gradient(w0, [seq_id])
                self.clear_cached_info([seq_id])
                delta = -epsilon * g
                w_prime = w0 + delta
                l_prime = self.compute_seq_loglikelihood(w_prime, seq_id)
                # clear seq_id info
                self.clear_cached_info([seq_id])
                # verify that l_prime ~ l0 + delta . g (first-order Taylor expansion)
                diff = numpy.abs(l0 + numpy.dot(delta, g) - l_prime)
                if(epsilon in res):
                    res[epsilon].append(diff)
                else:
                    res[epsilon] = [diff]
        diff_concat = []
        for eps, diff_array in res.items():
            print("epsilon = ", eps)
            print("difference across {} random initializations of w ".format(rounds), diff_array)
            diff_concat += diff_array
        avg_diff = numpy.mean(numpy.asarray(diff_concat))
        print("Average gradient difference across all epsilons and initializations is ", avg_diff)
        return(avg_diff)
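    # The check relies on the first-order Taylor expansion of the objective:
    #   L(w0 + delta) ~ L(w0) + delta . g     with delta = -epsilon * g
    # so the reported differences should shrink with epsilon when the
    # analytic gradient g is correct.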
    def validate_expected_featuresum(self, w, seqs_id):
        """validate the expected feature sum computation

           the gradient equals the empirical feature sum minus the model's expected
           feature sum, so its magnitude measures the difference between the two

           Args:
               w: weight vector (numpy vector)
               seqs_id: list of integers representing the unique ids assigned to the sequences
        """
        self.clear_cached_info(seqs_id)
        grad = self.compute_seqs_gradient(w, seqs_id)
        abs_grad = numpy.abs(grad)
        avg_diff = numpy.mean(abs_grad)
        print("difference between the empirical feature sum and the model's expected feature sum: \n {}".format(abs_grad))
        print("average difference is {}".format(avg_diff))
        self.clear_cached_info(seqs_id)
        return(avg_diff)
if __name__ == "__main__":
    pass