Source code for hansel.hansel

from math import log10

import numpy as np

#NOTE
# Although it is possible to 'fully' subclass numpy's ndarray, the
# accepted practice appears to be instructing users to create an ndarray first
# and then pass it to an object constructor responsible for 'converting' that
# ndarray to one of your own type.
# The benefit appears to be a simpler constructor (it need not emulate the
# actual constructor of ndarray) and I guess we are somewhat less coupled
# to the ndarray interface, too.
# You can read more about subclassing ndarray via the official documentation:
# http://docs.scipy.org/doc/numpy-1.10.1/user/basics.subclassing.html
[docs]class Hansel(np.ndarray): """API to a numpy-backed graph-inspired data structure for determining likely sequences of symbols from breadcrumbs of evidence. Given a numpy array and a list of permitted symbols, Hansel presents a user friendly API to store and operate on a graph-like data structure that can be used to investigate likely sequences of symbols based on observations of pairwise co-occurrence of those symbols in a dimension such as space or time. Parameters ---------- input_arr : 4D numpy array A numpy array (typically initialised with zeros) of size (A, A, B+2, B+2), where `A` is the number of `symbols` + `unsymbols` and `B` are the points in time or space on which pairwise observations between symbols can be observed. symbols : list{str} A list of permitted states or symbols as strings. unsymbols : list{str} A list of permitted states that may represent the known absence of a symbol. Attributes ---------- n_slices(sources) : int The number of distinct sources from which observations were received. n_crumbs(observations) : int The number of pairwise observations that were observed. symbols : list{str} The list of permitted states or symbols. unsymbols : list{str} A list of states that represent an absence of a symbol. is_weighted : boolean Whether or not the underlying numpy array has been modified by the `reweight_observation` function at least once. L : int, optional(default=1) The number of positions back from the current position (inclusive) to consider when calculating conditional probabilities. """ def __new__(cls, input_arr, symbols, unsymbols, L=1): # Force our class on the input_arr #TODO Is there an overhead in casting a view here (are we copying the # big matrix to a new object? :( obj = np.asarray(input_arr).view(cls) obj.is_weighted = False obj.symbols = symbols obj.symbols_d = {symbol: i for i, symbol in enumerate(symbols)} obj.unsymbols = unsymbols obj.n_slices= 0 obj.n_crumbs = 0 obj.L = L obj.valid_symbols = set(symbols) - set(unsymbols) return obj #NOTE Provides support for construction mechanisms of numpy def __array_finalize__(self, obj): #NOTE This is probably quite gross. # I am quite sure that `obj` will never be `None` here (despite docs) # as __new__ now invokes a view cast and thus always sends us to # __array_finalize__ *before* it has had a chance to set variables. # I try and catch us inside a "__new__ view cast" by checking whether # obj is a plain ndarray (which it *should* be if this is new). # Elsewise type(obj) should be hansel.Hansel for a view cast or # from-template on an existing Hansel structure. if obj is None or type(obj) == np.ndarray: return #NOTE Defaults are set here, not in __new__, as it is this method # that actually oversees creation of all objects (__new__) sends # us here anyway... Unfortunately this leaves us no choice but to # provide an impractical default for the symbols list... # View casting or from-template, add the missing info from the # existing object before finalizing self.is_weighted = getattr(obj, 'is_weighted', False) self.n_slices = getattr(obj, 'n_slices', 0) self.n_crumbs = getattr(obj, 'n_crumbs', 0) self.symbols = getattr(obj, 'symbols', []) self.symbols_d = getattr(obj, 'symbols_d', {}) self.unsymbols = getattr(obj, 'unsymbols', []) self.valid_symbols = getattr(obj, 'valid_symbols', set()) self.L = getattr(obj, 'L', 1) #TODO Safer warning? if self.symbols is None or len(self.symbols) == 0: import sys sys.stderr.write("[FAIL] Attempted to allocate Hansel structure without symbols.\n") sys.exit(1) def __orient_positions(self, i, j): if i < j: return i, j else: return j, i @property def sources(self): """An alias for `n_slices`""" return self.n_slices @property def observations(self): """An alias for `n_crumbs`""" return self.n_crumbs
[docs] def add_observation(self, symbol_from, symbol_to, pos_from, pos_to): """Add a pairwise observation to the data structure. Parameters ---------- symbol_from : str The first observed symbol of the pair (in space or time). symbol_to : str The second observed symbol of the pair (in space or time). pos_from : int The "position" at which `symbol_from` was observed. pos_to : int The "position" at which `symbol_to` was observed. """ self.n_crumbs += 1 pos_from, pos_to = self.__orient_positions(pos_from, pos_to) self[self.__symbol_num(symbol_from), self.__symbol_num(symbol_to), pos_from, pos_to] += 1
[docs] def get_observation(self, symbol_from, symbol_to, pos_from, pos_to): """Get the number of co-occurrences of a pair of positioned symbols. Parameters ---------- symbol_from : str The first observed symbol of the pair (in space or time). symbol_to : str The second observed symbol of the pair (in space or time). pos_from : int The "position" at which `symbol_from` was observed. pos_to : int The "position" at which `symbol_to` was observed. Returns ------- Number of observations : float The number of times `symbol_from` was observed at `pos_from` with `symbol_to` at `pos_to` across all sources (slices). .. note:: It is possible for the number of observations returned to be a `float` if :attr:`hansel.hansel.Hansel.is_weighted` is `True`. """ pos_from, pos_to = self.__orient_positions(pos_from, pos_to) return self[self.__symbol_num(symbol_from), self.__symbol_num(symbol_to), pos_from, pos_to]
#TODO Describe the conditions under which reweighting halts in the documentation
[docs] def reweight_observation(self, symbol_from, symbol_to, pos_from, pos_to, ratio): """Alter the number of co-occurrences between a pair of positioned symbols by some ratio. .. note:: This function will set :attr:`hansel.hansel.Hansel.is_weighted` to `True`. Parameters ---------- symbol_from : str The first observed symbol of the pair to be reweighted (in space or time). symbol_to : str The second observed symbol of the pair to be reweighted (in space or time). pos_from : int The "position" at which `symbol_from` was observed. pos_to : int The "position" at which `symbol_to` was observed. ratio : float The ratio by which to subtract the current number of observations. That is, `new_value = old_value - (ratio * old_value)`. """ pos_from, pos_to = self.__orient_positions(pos_from, pos_to) old_v = self[self.__symbol_num(symbol_from), self.__symbol_num(symbol_to), pos_from, pos_to] new_v = old_v - (ratio*old_v) if old_v != 0: if new_v < 1: self[self.__symbol_num(symbol_from), self.__symbol_num(symbol_to), pos_from, pos_to] = 0 return old_v else: self[self.__symbol_num(symbol_from), self.__symbol_num(symbol_to), pos_from, pos_to] = new_v return old_v - new_v return 0.0 #TODO This is a bit gross as we should maybe handle it with Gretel instead """ if self.get_observation(symbol_from, "_", pos_from, pos_from+1) > 0: #print "Reducing support between %d(%s) -> %d(%s) by %.2f (%.2f -> %.2f)" % (pos_from, symbol_from, pos_to, symbol_to, ratio, old_v, new_v) old_v = self[self.__symbol_num(symbol_from)][self.__symbol_num("_")][pos_from][pos_from+1] # Provisional testing seems to indicate this works best on small sets... #new_v = old_v - ((ratio*old_v) / (len( list(set(self.symbols) - set(self.unsymbols)) ) )) new_v = old_v - (ratio*old_v) try: marg_from = self.get_counts_at(pos_from+1) except IndexError: pass else: valid_symbols_seen = 0 for s in list(set(self.symbols) - set(self.unsymbols)): if s in marg_from: valid_symbols_seen += 1 new_v = old_v - ((ratio*old_v)/ valid_symbols_seen) if old_v != 0: if new_v < 1: self[self.__symbol_num(symbol_from)][self.__symbol_num("_")][pos_from][pos_from+1] = 0 else: self[self.__symbol_num(symbol_from)][self.__symbol_num("_")][pos_from][pos_from+1] = new_v """ self.is_weighted = True
def __symbol_num(self, symbol): #TODO Catch potential KeyError #TODO Generic mechanism for casing (considering non-alphabetically named states, too...) return self.symbols_d[symbol] def __symbol_unnum(self, num): #TODO Catch potential IndexError return self.symbols[num] #TODO What about non-whole genes (need a special symbol)
[docs] def get_counts_at(self, at_pos): """Get the counts for each symbol that appears at a given position. Parameters ---------- at_pos : int The "position" for which to return the number of occurrences of each symbol. Returns ------- Symbol counts : dict{str, float} A dictionary whose keys are each of the symbols that were observed at position `at_pos` and a special "total" key. The values are the number of observations of that symbol at `at_pos`. The "total" is the sum of all observation counts. """ marg = {"total": 0} if(at_pos == 0): permitted_a_symbols = self.symbols_d.keys() else: permitted_a_symbols = self.valid_symbols for symbol_a in permitted_a_symbols: obs = 0 for symbol_b in self.symbols_d: obs += self.get_observation(symbol_a, symbol_b, at_pos, at_pos+1) if obs > 0: marg[symbol_a] = obs marg["total"] += obs #print "\t[MARG] %d %s" % (symbol_pos, str(marg)) return marg
[docs] def get_marginal_of_at(self, of_symbol, at_symbol): """Get the marginal distribution of a symbol appearing at a position. Parameters ---------- of_symbol : str The symbol for which to calculate the marginal distribution. at_symbol : int The position at which to calculate the marginal distribution. Returns ------- Marginal probability : float The probability a random symbol drawn from all observations at `at_symbol` being equal to `of_symbol`. Alternatively, the proportion of all symbols observed at `at_symbol` being equal to `of_symbol`. """ marginal = self.get_counts_at(at_symbol) return marginal[of_symbol] / marginal["total"]
[docs] def get_edge_weights_at(self, symbol_pos, current_path): """Get the outgoing weighted edges at some position, given a path to that position. Parameters ---------- symbol_pos : int The index of the current position. current_path : list{str} A list of symbols representing the path of selected symbols that led to the current position, `symbol_pos`. Returns ------- Conditional distribution : dict{str, float} A dictionary whose keys are each of the possible symbols that may be reached from the current position, given the observed path. The values are `log10` conditional probabilities of the next symbol in the path (or sequence) being that of the key. """ # ...work out each probability, for each branch curr_branches = {} for symbol in self.get_counts_at(symbol_pos): if symbol in self.unsymbols or symbol == "total": continue curr_branches[symbol] = 0.0 if symbol_pos > 1: # If the length of the path, without the sentinel is less than L, # we can only inspect the available members of L so far... if len(current_path)-1 < self.L: l_limit = len(current_path)-1 else: l_limit = self.L for l in range(0, l_limit): curr_i = (len(current_path)-1) - l curr_branches[symbol] += log10(self.get_conditional_of_at(current_path[curr_i], symbol, curr_i, symbol_pos)) # Append the marginal of symbol at desired position curr_branches[symbol] += log10(self.get_marginal_of_at(symbol, symbol_pos)) return curr_branches
#TODO Given/predicted is a bit misleading as they turn out to be the "wrong way around"
[docs] def get_conditional_of_at(self, symbol_from, symbol_to, pos_from, pos_to): """Given a symbol and position, calculate the conditional for co-occurrence with another positioned symbol. Parameters ---------- symbol_from : str The first (given) symbol of the pair on which to condition. symbol_to : str The second (predicted) symbol of the pair on which to condition. pos_from : int The "position" at which the given `symbol_from` was observed. pos_to : int The "position" at which the predicted `symbol_to` was observed. Returns ------- Conditional probability : float The conditional probability of `symbol_from` occurring at `pos_from` given observation of a predicted `symbol_to` at `pos_to`. """ marg_from = self.get_counts_at(pos_from) obs = self.get_observation(symbol_from, symbol_to, pos_from, pos_to) total = self.get_spanning_support(symbol_to, pos_from, pos_to) valid_symbols_seen = 0 for s in self.valid_symbols: if s in marg_from: valid_symbols_seen += 1 return self.__estimate_conditional(valid_symbols_seen, obs, total)
#TODO Should this be "number of sources", rather than "number of observations"
[docs] def get_spanning_support(self, symbol_to, pos_from, pos_to): """Get the number of observations that span over two positions of interest, that also feature some symbol. Parameters ---------- symbol_to : str The symbol that should appear at `pos_to`. pos_from : int A position that appears "before" `pos_to` in space or time that must be overlapped by a source to be counted. The symbol at `pos_from` is not relevant. pos_to : int The second position a source must overlap (but not necessarily terminate at) that must be the symbol `symbol_to`. Returns ------- Number of observations : float The number of observations yielded from sources that overlap both `pos_from` and `pos_to`, that also feature `symbol_to` at `pos_to`. .. note:: It is possible for the number of observations returned to be a `float` if :attr:`hansel.hansel.Hansel.is_weighted` is `True`. """ total = 0 for symbol_from in self.valid_symbols: total += self.get_observation(symbol_from, symbol_to, pos_from, pos_to) return total
def __estimate_conditional(self, av, obs, total): return (1 + obs)/float(av + total)