Given a gene/protein of unknown function, we want to see if we can match it with known sequence families through multiple pairwise alignments. However, it can be very difficult to align distantly related sequences if we use a scoring method that is consistent across all positions in the alignment (Smith-Waterman, Needleman-Wunsch).
We can use HMMs to align a new sequence to a multiple alignment with family-specific scoring by determining the match/insertion/deletion states and the emission probabilities for each base (nt/AA).
The profile HMM's emission matrix is similar to scoring matrix (eg. PAM250) that changes at each position, with in/dels also being treated differently by the transition matrix.
The first step is to create a profile HMM from a multiple sequence alignment of the protein/gene family to be queried.
Profile HMM from a multiple sequence alignment (MSA)
First we remove all columns from the MSA with proportion of gaps being greater than some threshold $\theta$ (eg. 0.35) to create a seed alignment (alignment*). Any gaps in the seed alignment will be deletion states, any symbols removed from the non-seed columns will be insertion states. Anytime there is a base in a seed column it will be in a match state.
We will get emission probabilities for all the insertion and match states but the deletion states will have all zero emission probabilities. We will the calculate the proportions of all transitions and emissions from each state to build the HMM
HMM consists of:
Algorithm for Profile HMM(alignment*, theta)
The somewhat tricky part to implement is to discern what hidden state the sequence is transitioning to since it is not enough to specify only M/D/I. (subroutine: find/return the next M/D/I/end state)
The insertion state can be between any two match and/or del states. A single insertion state can occur multiple times if there are multiple bases in the insert (count each emission).
Use matrix algebra for normalization: row = row/sum(row)
Input: A multiple alignment Alignment and a threshold θ. Output: HMM(Alignment, θ), in the form of transition and emission matrices. Code Challenge: Solve the Profile HMM Problem.
Input: A threshold θ, followed by an alphabet Σ, followed by a multiple alignment Alignment whose strings are formed from Σ. Output: The transition matrix followed by the emission matrix of HMM(Alignment, θ). Note: Your matrices can be either space-separated or tab-separated.
import pandas as pd
import numpy as np
### algorithm for building profile HMM(alignment*, theta)
def profileHMM(alignment, alphabet, theta):
# # 'seed' indicates which columns pass threshold theta for align*
def get_seed_alignment(alignment, theta, alphabet):
"""'seed': True if gaps < theta; included in align*
returns seed alignment columns for HMM"""
k = len(alignment[0])
a = len(alphabet.keys())
# print("Length of alignment(k) = ",k)
# print("size of alphabet = ",len(alphabet),'\n')
freq = np.zeros(shape=(a+1, k))
for seq in alignment:
for i in range(k):
if seq[i] == '-':
# print('\tgap', seq[i])
freq[a][i] += 1
else:
freq[alphabet[seq[i]]][i] +=1
# print('\tbase', seq[i])
n = len(alignment)
print('\nFrequency matrix of alignment and seed columns:\n', freq)
seed = [x/n < theta for x in freq[a]]
# print('seed:',seed)
print(" ", ' '.join(list('+' if i else '-' for i in seed)))
return seed
def normalize_matrices(T,E):
"""counts to probabilities: normalize across rows"""
for state in range(len(S)):
if sum(T[state]) > 0:
T[state] = T[state]/sum(T[state])
if sum(E[state]) >0:
E[state] = E[state]/sum(E[state])
return T, E
def state_transition(T, prev, kind, S):
"""return next Match, Del, Ins, or End state in [states] """
x=0
if S[prev][0] == 'M':
x=1
for nxt in range(prev+1+x, len(T)):
if S[nxt][0] == kind[0]:
T[prev][nxt] += 1
# print(' >Transition added:',((S[prev],prev),(S[nxt], nxt)))
return T, nxt
### Walk each sequence through it's hidden states and count transitions & emissions
seed = get_seed_alignment(alignment, theta, alphabet)
n = len(alignment)
k = len(alignment[0])
S = ['S','I0']+list(c+str(n) for n in range(1,sum(seed)+1) for c in "MDI")+['E']
E = np.zeros(shape=(len(S), len(alphabet.keys())))
T = np.zeros(shape=(len(S), len(S)))
for seq in alignment:
# 'state' is the hidden state col/row; i is pos in alignment
state = 0; i = 0
while i < k:
# seed: either match or del as hidden state
if seed[i]:
if seq[i] in alphabet:
T, state = state_transition(T, state, 'Match', S)
E[state][alphabet[seq[i]]] +=1
else:
T, state = state_transition(T, state, 'Deletion', S)
# not seed: either insert or nothing
else:
# count any emissions before next seed column
emits = []
while not seed[i]:
if seq[i] in alphabet:
emits.append(seq[i])
i += 1
if i == k:
break # hit end of sequence
i-=1
if len(emits)>0:
# count the transition(state, insertion)
T, state = state_transition(T, state, 'Insert', S)
# get all emissions(state) in insertion
for symbol in emits:
E[state][alphabet[symbol]] += 1
# count all symbols as cyclic transition t(ins_x, ins_x)
if len(emits) >1:
T[state][state] += len(emits)-1
# else do nothing, just gaps in align
i += 1
# from last state to 'End'
T, state = state_transition(T, state, 'End', S)
T,E = normalize_matrices(T,E)
return S, T, E
Sample Input:
0.289
--------
A B C D E
--------
EBA
E-D
EB-
EED
EBD
EBE
E-D
E-D
Sample Output:
S I0 M1 D1 I1 M2 D2 I2 E
S 0 0 1.0 0 0 0 0 0 0
I0 0 0 0 0 0 0 0 0 0
M1 0 0 0 0 0.625 0.375 0 0 0
D1 0 0 0 0 0 0 0 0 0
I1 0 0 0 0 0 0.8 0.2 0 0
M2 0 0 0 0 0 0 0 0 1.0
D2 0 0 0 0 0 0 0 0 1.0
I2 0 0 0 0 0 0 0 0 0
E 0 0 0 0 0 0 0 0 0
--------
A B C D E
S 0 0 0 0 0
I0 0 0 0 0 0
M1 0 0 0 0 1.0
D1 0 0 0 0 0
I1 0 0.8 0 0 0.2
M2 0.143 0 0 0.714 0.143
D2 0 0 0 0 0
I2 0 0 0 0 0
E 0 0 0 0 0
# Parsing input from txt and formatting output
def parse_HMM(file):
lines = f.readlines()
theta = float(lines[0].strip())
alphabet = lines[2].strip().split()
alphabet = dict(zip(alphabet, range(len(alphabet))))
alignment = [line.strip() for line in lines[4:]]
print("Alignment:")
for a in alignment:
print('\t', a)
print("Alphabet\n\t", ' '.join(alphabet))
print("Theta:", theta)
return alignment, alphabet, theta
### Sample data:
with open('/Users/jasonmoggridge/Dropbox/Rosalind/Coursera_textbook_track/Course6/data/10e.extra.txt') as f:
alignment, alphabet, theta = parse_HMM(f)
S, T, E = profileHMM(alignment, alphabet, theta)
T = np.around(T,3)
E = np.around(E,3)
alphabet = list(alphabet.keys())
print('\n\nPROFILE HMM---\nStates:', ' '.join(S), '\nAlphabet', alphabet)
print("\nTransition matrix\n", pd.DataFrame(T, index=S, columns=S))
print("\nEmission matrix\n", pd.DataFrame(E, index=S, columns=alphabet))
# lists =outputHMM(T,E,S, alphabet)
Given: A threshold θ and a pseudocount σ, followed by an alphabet Σ, followed by a multiple alignment Alignment whose strings are formed from Σ.
Return: The transition and emission probabilities of the profile HMM HMM(Alignment, θ, σ).
Exactly the same algorithm as above but with a modification to the normalization step to introduce small pseudocounts to allow transitions not in the input alignment (to accomodate query sequence with different transition/emission without probability dropping to 0 immediately). Just add 0.01 to each count before normalizing. Use matrix algebra for normalization: row = row/sum(row)
def pseudo_normalize(T, E):
# add pseudos and normalize Emission matrix
for each state i that isn't start, del, or end:
E_ij += 0.01 for all symbols j
E_i = E_i/ sum(E_i)
# add pseudos and normalize Transition matrix
for state i that isn't End state:
specify range of allowed transitions from S_i...
for next state j in allowed range that isn't End:
T_ij += 0.01
T_i = T_i/sum(T_i)
return T, E matrices
def pseudo_normalize(T, E):
"""adds pseudocounts to HMM matrices"""
# add pseudos and normalize E
for i in range(1, len(S)-1):
if S[i][0] != 'D':
E[i] += 0.01
E[i] = E[i]/ sum(E[i])
# add pseudos and normalize T
for i in range(len(S)-1):
start = 3*((i+1)//3)+1
end = start+3
for j in range(start, end):
if j < len(S):
T[i][j] += 0.01
T[i] = T[i]/sum(T[i])
return T, E
### Extra test data 1:
def parse_HMM(file):
lines = f.readlines()
[theta, sigma] = [float(i) for i in lines[0].strip().split('\t')]
alphabet = lines[2].strip().split()
alphabet = dict(zip(alphabet, range(len(alphabet))))
alignment = [line.strip() for line in lines[4:]]
print("Alignment:")
for a in alignment:
print('\t', a)
print("Alphabet\n\t", ' '.join(alphabet))
print("Theta:", theta, "\tSigma:", sigma)
return alignment, alphabet, theta, sigma
with open('./data/10f.extra.txt') as f:
alignment, alpha, theta, sigma = parse_HMM(f)
S, T, E = profileHMM(alignment, alpha, theta)
alpha = list(alpha.keys())
T, E = pseudo_normalize(T, E)
T = np.around(T,3)
E = np.around(E,3)
print('\n\nPROFILE HMM---\nStates:', ' '.join(S), '\nAlphabet', alphabet)
print("\nTransition matrix\n", pd.DataFrame(T, index=S, columns=S))
print("\nEmission matrix\n", pd.DataFrame(E, index=S, columns=alphabet))
### Output txt format
# with open('10e.out.txt','w') as file:
# for line in lists[:-1]:
# if line[0] != '-':
# file.write('\t'.join(line))
# else:
# file.write(line)
# file.write('\n')
# file.write('\t'.join(lists[-1]))
# lists =outputHMM(T,E,S, alpha)
# with open('./data/10f.extra.out.txt','w') as file:
# for line in lists[:-1]:
# if line[0] != '-':
# file.write('\t'.join(line))
# else:
# file.write(line)
# file.write('\n')
# file.write('\t'.join(lists[-1]))
# del(line, lists)