This notebook presents Python implementations of hidden Markov model (HMM) algorithms, as outlined in Chapter 10 of Bioinformatics Algorithms by Pevzner and Compeau, with accompanying sample problems from Rosalind (the BA10 problem set).
This notebook covers the basics of HMMs for sequence analysis: how to compute the probabilities of emitted strings and hidden paths, given an existing HMM.
Creating profile HMMs, Viterbi learning, parameter estimation, the Forward-Backward algorithm, and Baum-Welch learning are covered in the next notebook.
http://rosalind.info/problems/ba10a/
Use the product rule for state transitions to calculate the probability of the hidden path π, given a transition matrix for a Markov chain.
Given: A hidden path $π$ followed by the states States and transition matrix Transition of an HMM (Σ, States, Transition, Emission). (There is no emission matrix in this example; just a Markov chain of states)
Return: The probability of this path, $Pr(π)$. You may assume that initial probabilities are equal.
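With equal initial probabilities, the product rule decomposes the path probability into one factor per transition (this is the formula the code below implements):

$$Pr(\pi) = \frac{1}{|States|} \prod_{i=2}^{n} \mathrm{transition}(\pi_{i-1}, \pi_i)$$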
Sample data:
AABBBAABABAAAABBBBAABBABABBBAABBAAAABABAABBABABBAB
--------
A B
--------
A B
A 0.377 0.623
B 0.26 0.74
Sample Output
5.01732865318e-19
import numpy as np

## calculate probability using the 'product rule' for each transition
def prob_hiddenpath(hidden_path, T_matrix, states):
    """Calculates the probability of a hidden path, given the transition matrix of an HMM"""
    # encode hidden path symbols as integer indices into the transition matrix
    hidden_path = [states.index(state) for state in hidden_path]
    # assume all transitions from the initial state occur with equal probability
    prob = 1 / len(states)
    prev = hidden_path[0]
    for state in hidden_path[1:]:
        prob *= T_matrix[prev][state]
        prev = state
    return prob
pi = 'AABBBAABABAAAABBBBAABBABABBBAABBAAAABABAABBABABBAB'
states = ['A', 'B']
# transition matrix from the sample dataset above, so the printed value should match the sample output
T_matrix = [[0.377, 0.623], [0.26, 0.74]]
prob_pi = prob_hiddenpath(pi, T_matrix, states)
print("probability of the hidden path π:", prob_pi)
http://rosalind.info/problems/ba10b/
Assuming we already know the states of the hidden path, finding the probability of the emitted string simply requires applying the product rule to the emission probabilities.
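In symbols, for an emitted string $x = x_1 \dots x_n$ and hidden path $\pi = \pi_1 \dots \pi_n$:

$$Pr(x \mid \pi) = \prod_{i=1}^{n} \mathrm{emission}_{\pi_i}(x_i)$$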
Sample data:
zzzyxyyzzx # emission string x
--------
x y z # emission alphabet
--------
BAAAAAAAAA # hiddenpath pi
--------
A B # hidden States
--------
x y z
A 0.176 0.596 0.228
B 0.225 0.572 0.203 # Emission matrix
Sample Output:
3.59748954746e-06
def p_emissions(emissions, alphabet, hidden_path, states, e_matrix):
    """Calculates the probability of the emitted string, given the hidden path and emission matrix"""
    # encode emission and state symbols as integer indices into the emission matrix
    hidden_path = [states.index(state) for state in hidden_path]
    emissions = [alphabet.index(em) for em in emissions]
    probability = 1
    # 'product rule' over the emission probability at each position
    for i in range(len(emissions)):
        probability *= e_matrix[hidden_path[i]][emissions[i]]
    return probability
emissions = "zzzyxyyzzx"
alphabet = ['x','y','z']
pi = "BAAAAAAAAA"
states = ['A','B']
E_matrix = np.array([[0.176, 0.596, 0.228], [0.225, 0.572, 0.203]])
print('\nProbability of emitted string, given HMM and hidden path:')
print(p_emissions(emissions, alphabet, pi, states, E_matrix))
http://rosalind.info/problems/ba10c/
Given: A string x, followed by the alphabet Σ from which x was constructed, followed by the states States, transition matrix Transition, and emission matrix Emission of an HMM (Σ, States, Transition, Emission).
Return: A path that maximizes the (unconditional) probability Pr(x, π) over all possible paths π.
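Note that the joint probability factors into the two quantities computed above, $Pr(x, \pi) = Pr(\pi) \cdot Pr(x \mid \pi)$; the Viterbi algorithm finds the path $\pi$ maximizing this product.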
Sample Dataset
xyxzzxyxyy
--------
x y z
--------
A B
--------
A B
A 0.641 0.359
B 0.729 0.271
--------
x y z
A 0.117 0.691 0.192
B 0.097 0.42 0.483
Sample Output:
AAABBAAAAA
Algorithm
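The Viterbi graph is filled column by column by dynamic programming. Working in log space to prevent underflow, the score of state $k$ at position $i$ is

$$s_{k,i} = \max_{prev}\left\{ s_{prev,\,i-1} + \log \mathrm{transition}(prev, k) \right\} + \log \mathrm{emission}_{k}(x_i), \qquad s_{k,1} = \log\frac{1}{|States|} + \log \mathrm{emission}_{k}(x_1)$$

Backtracking from the highest-scoring state in the last column recovers the maximum-likelihood path.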
import numpy as np
def viterbi_algorithm(emission, T, E, states, alphabet):
    """Returns the maximum-likelihood hidden path for an emitted string, given an HMM (log-space T and E)"""
    S = len(states)
    n = len(emission)
    # initialize Viterbi graph and backpointer matrices
    viterbi = np.full((S, n), -np.inf)
    pointers = [[None for e in range(n)] for s in range(S)]
    # first column: weight(node) = log(1/|States|) + log(Pr_emission)
    for state in range(S):
        viterbi[state][0] = np.log(1 / S) + E[state][emission[0]]
    # fill the Viterbi graph using dynamic programming
    for i in range(1, n):
        for state in range(S):
            for prev in range(S):
                p_total = E[state][emission[i]] + T[prev][state] + viterbi[prev][i - 1]
                # keep the max-weight path into the current node
                if p_total > viterbi[state][i]:
                    viterbi[state][i] = p_total
                    pointers[state][i] = prev
    # start the backtrack from the max-likelihood state in the last column
    last = int(np.argmax(viterbi[:, n - 1]))
    path = [last]
    # backtrack to recreate the max-likelihood hidden path in reverse
    for i in range(n - 1, 0, -1):
        last = pointers[last][i]
        path.append(last)
    # reverse to get the hidden path solution to the decoding problem
    return ''.join(states[state] for state in path[::-1])
## Parse Rosalind HMM input data
## Matrices are log-transformed to prevent underflow on long strings
def parse_HMM(lines):
    """Parses a Rosalind HMM dataset into an encoded emission string, alphabet, states, and log T/E matrices"""
    emission = lines[0].strip()
    alphabet = lines[2].strip().split()
    # encode emitted symbols as integer column indices of the emission matrix
    emission = [alphabet.index(em) for em in emission]
    states = lines[4].strip().split()
    S = len(states)
    # transition rows follow the column header on line 6; emission rows follow the header on line 8+S
    T = np.log(np.array([line.split()[1:] for line in lines[7:7 + S]], float))
    E = np.log(np.array([line.split()[1:] for line in lines[9 + S:9 + 2 * S]], float))
    return emission, alphabet, states, T, E
## Sample data:
with open("./data/10c_test.txt") as f:
    lines = [line.strip() for line in f]
emission, alphabet, states, T, E = parse_HMM(lines)
decoded_pi = viterbi_algorithm(emission, T, E, states, alphabet)
print('Emitted:', emission, '\nAlphabet:', alphabet, '\nStates:', states,
      '\nE_matrix\n', E, '\nT_matrix\n', T)
print('\nDecoded path π =', decoded_pi)
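If the test file under ./data/ is not available, the sample dataset above can be hard-coded directly. A minimal sketch (matrix values copied by hand from the BA10C sample, so the decoded path should match the sample output):

alphabet = ['x', 'y', 'z']
states = ['A', 'B']
emission = [alphabet.index(c) for c in "xyxzzxyxyy"]
T = np.log(np.array([[0.641, 0.359], [0.729, 0.271]]))   # log-space, as parse_HMM would return
E = np.log(np.array([[0.117, 0.691, 0.192], [0.097, 0.42, 0.483]]))
print(viterbi_algorithm(emission, T, E, states, alphabet))   # expected: AAABBAAAAA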
http://rosalind.info/problems/ba10d/
What is the likelihood that an HMM emits a particular string, summed over all possible hidden paths?
Given: A string x, followed by the alphabet Σ from which x was constructed, followed by the states States, transition matrix Transition, and emission matrix Emission of an HMM (Σ, States, Transition, Emission).
Return: The probability Pr(x) that the HMM emits x.
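The forward algorithm answers this with the same dynamic-programming graph as Viterbi, but with each maximum replaced by a sum over all incoming edges:

$$f_{k,i} = \mathrm{emission}_{k}(x_i) \sum_{prev} f_{prev,\,i-1}\, \mathrm{transition}(prev, k), \qquad Pr(x) = \sum_{k} f_{k,n}$$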
Sample Dataset
xzyyzzyzyy
--------
x y z
--------
A B
--------
A B
A 0.303 0.697
B 0.831 0.169
--------
x y z
A 0.533 0.065 0.402
B 0.342 0.334 0.324
Sample Output
1.1005510319694847e-06
## Algorithm
def outcome_likelihood(emission, T, E, states):
    """Returns the likelihood Pr(x) of the emitted string, summed over all hidden paths (the forward algorithm)"""
    S = len(states)
    n = len(emission)
    # forward dynamic-programming table: same graph as Viterbi, with sums in place of maxima
    forward = np.zeros((S, n))
    # first column: equal initial probabilities times Pr_emission
    for state in range(S):
        forward[state][0] = 1 / S * E[state][emission[0]]
    # fill the graph with sums over all incoming edges for each node
    for i in range(1, n):
        for state in range(S):
            em = E[state][emission[i]]
            for prev in range(S):
                forward[state][i] += T[prev][state] * em * forward[prev][i - 1]
    # Pr(x) is the sum over the last column
    return sum(forward[s][n - 1] for s in range(S))
with open("./data/10d_test.txt") as f:
lines = [line.strip() for line in f]
emission, alphabet, states, T, E = parse_HMM(lines)
T = np.exp(T)
E = np.exp(E)
print('string:', emission,'\nstates',states,'\n\nE_matrix\n',E, '\n\nT_matrix\n', T)
outcome_likelihood(emission, T, E, states)
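As with Viterbi, the sample dataset can be run without the data file. A minimal sketch (values copied by hand from the BA10D sample; here the matrices stay in probability space):

alphabet = ['x', 'y', 'z']
states = ['A', 'B']
emission = [alphabet.index(c) for c in "xzyyzzyzyy"]
T = np.array([[0.303, 0.697], [0.831, 0.169]])
E = np.array([[0.533, 0.065, 0.402], [0.342, 0.334, 0.324]])
print(outcome_likelihood(emission, T, E, states))   # expected: ~1.1005510319694847e-06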