-
-
Save ktym/7a4799fb055436dc2139308dcab802f0 to your computer and use it in GitHub Desktop.
| #!/usr/bin/env ruby | |
| =begin | |
| # Pairwise sequence alignment algorithms | |
| Currently implements Needleman-Wunsch, Smith-Waterman | |
| and Needleman-Wunsch-Gotoh algorithms based on | |
| the Dynamic Programming (DP). | |
| ## Coordinates | |
| To treat (i,j) as 1-based coordinates (instead of Ruby's 0-based), | |
| use the following accessors: | |
| * DP::Matrix#get(i,j), DP::Matrix#set(i,j) | |
| * DP::Sequence#[i], DP::Sequence#[j] | |
| ``` | |
| M(i,j) | Target A C C A G T | |
| -------+-------------------------------------------------- | |
| Query | (0,0) i=1 i=2 i=3 i=4 i=5 i=6 | |
| A | j=1 (1,1) (2,1) (3,1) (4,1) (5,1) (6,1) | |
| C | j=2 (1,2) (2,2) (3,2) (4,2) (5,2) (6,2) | |
| A | j=3 (1,3) (2,3) (3,3) (4,3) (5,3) (6,3) | |
| G | j=4 (1,4) (2,4) (3,4) (4,4) (5,4) (6,4) | |
| C | j=5 (1,5) (2,5) (3,5) (4,5) (5,5) (6,5) | |
| ``` | |
| ## Accessors | |
| Each sub-class has the following accessors to modify parameters | |
| before the calculation of DP: | |
| ``` | |
| aligner = DP.new(target, query) | |
| aligner.gap_penalty = 2 # gap open penalty | |
| aligner.ext_penalty = 1 # gap extention penalty | |
| aligner.match = 1 | |
| aligner.mis_match = -1 | |
| ``` | |
| ## Calculation | |
| In the DP#dp method, init_matrix, calc_matrix and trace_back functions | |
| are called which are required to be implemented in each sub-class. | |
| ``` | |
| aligner.dp | |
| # => call init_matrix(), calc_matrix() and trace_back() then print alignment | |
| ``` | |
| ## References | |
| * Needleman SB, Wunsch CD: A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol 1970, 48:443-53. | |
| * Smith TF, Waterman MS: Identification of common molecular subsequences. J Mol Biol 1981, 147:195-7. | |
| * Gotoh O: An improved algorithm for matching biological sequences. J Mol Biol 1982, 162:705-8. | |
| ## Notes | |
| * [Lecture notes by Terai](http://asailab.cb.k.u-tokyo.ac.jp/wp-content/uploads/2014/06/20181002_terai_pairwisealignment.mod2_.pdf) | |
| * [Lecture notes by Asai](http://asailab.cb.k.u-tokyo.ac.jp/asai/lecture/H30Genome3_Alignment.pdf) | |
| * [Lecture notes by Yokoyama](https://paper.dropbox.com/doc/--Af886MpyNXIKPsDWulpka5t1Ag-dqPm9SbT1DoLXnSyzu6lo) | |
| * [DP basics in Ruby](https://www.jabba.cloud/20161020172918/) | |
| ## Copyright | |
| MIT license, Toshiaki Katayama <ktym@dbcls.jp> | |
| =end | |
| # Generic interface | |
| class DP | |
| attr_accessor :gap_penalty, :ext_penalty, :match, :mis_match | |
| def initialize(target, query) | |
| @gap_penalty = 2 | |
| @ext_penalty = 1 | |
| @match = 1 | |
| @mis_match = -1 | |
| @target = Sequence.new(target) | |
| @query = Sequence.new(query) | |
| end | |
| def s(x, y) | |
| return x == y ? @match : @mis_match | |
| end | |
| def init_matrix | |
| raise NotImplementedError.new("#{self.class}##{__method__}: Create @matrix = Matrix.new(tlen, qlen)") | |
| end | |
| def calc_matrix | |
| raise NotImplementedError.new("#{self.class}##{__method__}: Update @matrix with .get() and .set()") | |
| end | |
| def trace_back | |
| raise NotImplementedError.new("#{self.class}##{__method__}: Generate @target_align, @query_align") | |
| end | |
| class Sequence | |
| def initialize(seq) | |
| @seq = seq | |
| end | |
| def to_s | |
| @seq | |
| end | |
| def [](one_base_position) | |
| zero_base_position = one_base_position - 1 | |
| @seq[zero_base_position] | |
| end | |
| def length | |
| @seq.length | |
| end | |
| end | |
| class Matrix | |
| def initialize(x, y, default = 0) | |
| @matrix = Array.new(x+1) { Array.new(y+1, default) } | |
| end | |
| def get(i, j) | |
| @matrix[j][i] | |
| end | |
| def set(i, j, v) | |
| @matrix[j][i] = v | |
| end | |
| end | |
| def dp | |
| puts "Target: #{@target}" | |
| puts "Query: #{@query}" | |
| init_matrix | |
| pp @matrix if $DEBUG | |
| calc_matrix | |
| pp @matrix | |
| trace_back | |
| puts "Target: #{@target_align}" | |
| puts "Query: #{@query_align}" | |
| end | |
| end | |
| # Global alignment | |
| class NeedlemanWunsch < DP | |
| def init_matrix | |
| @matrix = Matrix.new(@query.length, @target.length, 0) | |
| 1.upto(@target.length) do |i| | |
| value = - (i * @gap_penalty) | |
| @matrix.set(i, 0, value) | |
| end | |
| 1.upto(@query.length) do |j| | |
| value = - (j * @gap_penalty) | |
| @matrix.set(0, j, value) | |
| end | |
| end | |
| def calc_matrix | |
| 1.upto(@target.length) do |i| | |
| 1.upto(@query.length) do |j| | |
| value = [ | |
| @matrix.get(i-1, j-1) + s(@target[i], @query[j]), | |
| @matrix.get(i, j-1) - @gap_penalty, | |
| @matrix.get(i-1, j ) - @gap_penalty, | |
| ].max | |
| @matrix.set(i, j, value) | |
| end | |
| end | |
| end | |
| def trace_back | |
| i = @target.length | |
| j = @query.length | |
| target_trace = "" | |
| query_trace = "" | |
| score = @matrix.get(i, j) | |
| while i > 0 or j > 0 | |
| case score | |
| when @matrix.get(i-1, j) - @gap_penalty | |
| score = @matrix.get(i-1, j) | |
| target_trace += @target[i] | |
| query_trace += '-' | |
| i -= 1 | |
| when @matrix.get(i, j-1) - @gap_penalty | |
| score = @matrix.get(i, j-1) | |
| target_trace += '-' | |
| query_trace += @query[j] | |
| j -= 1 | |
| else | |
| score = @matrix.get(i-1, j-1) | |
| target_trace += @target[i] | |
| query_trace += @query[j] | |
| i -= 1 | |
| j -= 1 | |
| end | |
| end | |
| @target_align = target_trace.reverse | |
| @query_align = query_trace.reverse | |
| end | |
| end | |
| # Local alignment | |
| class SmithWaterman < DP | |
| def init_matrix | |
| @matrix = Matrix.new(@query.length, @target.length, 0) | |
| end | |
| def calc_matrix | |
| @best = {:score => 0, :i => 0, :j => 0} | |
| 1.upto(@target.length) do |i| | |
| 1.upto(@query.length) do |j| | |
| value = [ | |
| @matrix.get(i-1, j-1) + s(@target[i], @query[j]), | |
| @matrix.get(i, j-1) - @gap_penalty, | |
| @matrix.get(i-1, j ) - @gap_penalty, | |
| 0, | |
| ].max | |
| @matrix.set(i, j, value) | |
| if @matrix.get(i, j) > @best[:score] | |
| @best[:score] = @matrix.get(i, j) | |
| @best[:i] = i | |
| @best[:j] = j | |
| end | |
| end | |
| end | |
| end | |
| def trace_back | |
| i = @best[:i] | |
| j = @best[:j] | |
| target_trace = "" | |
| query_trace = "" | |
| score = @matrix.get(i, j) | |
| while i > 0 or j > 0 | |
| case score | |
| when @matrix.get(i-1, j) - @gap_penalty | |
| score = @matrix.get(i-1, j) | |
| target_trace += @target[i] | |
| query_trace += '-' | |
| i -= 1 | |
| when @matrix.get(i-1, j) - @gap_penalty | |
| score = @matrix.get(i, j-1) | |
| target_trace += '-' | |
| query_trace += @query[j] | |
| j -= 1 | |
| else | |
| score = @matrix.get(i-1, j-1) | |
| target_trace += @target[i] | |
| query_trace += @query[j] | |
| break if score == 0 | |
| i -= 1 | |
| j -= 1 | |
| end | |
| end | |
| @target_align = target_trace.reverse | |
| @query_align = query_trace.reverse | |
| end | |
| end | |
| # Global alignment with affine gap penalty | |
| class NeedlemanWunschGotoh < DP | |
| Inf = Float::INFINITY | |
| def init_matrix | |
| @matrix = { | |
| :m => Matrix.new(@query.length, @target.length), | |
| :x => Matrix.new(@query.length, @target.length), | |
| :y => Matrix.new(@query.length, @target.length), | |
| } | |
| @matrix[:m].set(0, 0, 0) | |
| @matrix[:x].set(0, 0, -Inf) | |
| @matrix[:y].set(0, 0, -Inf) | |
| 1.upto(@target.length) do |i| | |
| @matrix[:m].set(i, 0, -Inf) | |
| @matrix[:y].set(i, 0, -Inf) | |
| value = - @gap_penalty - @ext_penalty * (i-1) | |
| @matrix[:x].set(i, 0, value) | |
| end | |
| 1.upto(@query.length) do |j| | |
| @matrix[:m].set(0, j, -Inf) | |
| @matrix[:x].set(0, j, -Inf) | |
| value = - @gap_penalty - @ext_penalty * (j-1) | |
| @matrix[:y].set(0, j, value) | |
| end | |
| end | |
| def calc_matrix | |
| 1.upto(@target.length) do |i| | |
| 1.upto(@query.length) do |j| | |
| s_value = s(@target[i], @query[j]) | |
| value_m = [ | |
| @matrix[:m].get(i-1, j-1) + s_value, | |
| @matrix[:x].get(i-1, j-1) + s_value, | |
| @matrix[:y].get(i-1, j-1) + s_value, | |
| ].max | |
| @matrix[:m].set(i, j, value_m) | |
| value_x = [ | |
| @matrix[:m].get(i-1, j) - @gap_penalty, | |
| @matrix[:x].get(i-1, j) - @ext_penalty, | |
| ].max | |
| @matrix[:x].set(i, j, value_x) | |
| value_y = [ | |
| @matrix[:m].get(i, j-1) - @gap_penalty, | |
| @matrix[:y].get(i, j-1) - @ext_penalty, | |
| ].max | |
| @matrix[:y].set(i, j, value_y) | |
| end | |
| end | |
| end | |
| def trace_back | |
| i = @target.length | |
| j = @query.length | |
| score = { | |
| :m => @matrix[:m].get(i, j), | |
| :x => @matrix[:x].get(i, j), | |
| :y => @matrix[:y].get(i, j), | |
| } | |
| cur = score.max{|a,b| a[1] <=> b[1]}.first | |
| target_trace = "" | |
| query_trace = "" | |
| while i > 0 or j > 0 | |
| score = @matrix[cur].get(i, j) | |
| s_value = s(@target[i], @query[j]) | |
| case cur | |
| when :m | |
| case score | |
| when @matrix[:m].get(i-1, j-1) + s_value | |
| score = @matrix[:m].get(i-1, j-1) | |
| cur = :m | |
| when @matrix[:x].get(i-1, j-1) + s_value | |
| score = @matrix[:x].get(i-1, j-1) | |
| cur = :x | |
| when @matrix[:y].get(i-1, j-1) + s_value | |
| score = @matrix[:y].get(i-1, j-1) | |
| cur = :y | |
| end | |
| target_trace += @target[i] | |
| query_trace += @query[j] | |
| i -= 1 | |
| j -= 1 | |
| when :x | |
| case score | |
| when @matrix[:m].get(i-1, j) - @gap_penalty | |
| score = @matrix[:m].get(i-1, j) | |
| cur = :m | |
| when @matrix[:x].get(i-1, j) - @ext_penalty | |
| score = @matrix[:x].get(i-1, j) | |
| cur = :x | |
| end | |
| target_trace += @target[i] | |
| query_trace += '-' | |
| i -= 1 | |
| when :y | |
| case score | |
| when @matrix[:m].get(i, j-1) - @gap_penalty | |
| score = @matrix[:m].get(i, j-1) | |
| cur = :m | |
| when @matrix[:y].get(i, j-1) - @ext_penalty | |
| score = @matrix[:y].get(i, j-1) | |
| cur = :y | |
| end | |
| target_trace += '-' | |
| query_trace += @query[j] | |
| j -= 1 | |
| end | |
| end | |
| @target_align = target_trace.reverse | |
| @query_align = query_trace.reverse | |
| end | |
| end | |
| if __FILE__ == $0 | |
| target = ARGV.shift || 'ACCAGT' | |
| query = ARGV.shift || 'ACAGC' | |
| puts "# Needleman-Wunsch" | |
| nw = NeedlemanWunsch.new(target, query) | |
| nw.dp | |
| puts "# Smith-Waterman" | |
| sw = SmithWaterman.new(target, query) | |
| sw.dp | |
| puts "# Needleman-Wunsch-Gotoh" | |
| nwg = NeedlemanWunschGotoh.new(target, query) | |
| nwg.dp | |
| end | |
ktym
commented
Jun 30, 2019
Bug fixed:)
Needleman-Wunsch
Target: ACCAGT
Query: ACAGC
#<DP::Matrix:0x00007fb6cd885ff0
@matrix=
[[0, -2, -4, -6, -8, -10, -12],
[-2, 1, -1, -3, -5, -7, -9],
[-4, -1, 2, 0, -2, -4, -6],
[-6, -3, 0, 1, 1, -1, -3],
[-8, -5, -2, -1, 0, 2, 0],
[-10, -7, -4, -1, -2, 0, 1]]>
Target: ACCAGT
Query: AC-AGC
Smith-Waterman
Target: ACCAGT
Query: ACAGC
#<DP::Matrix:0x00007fb6cd110d10
@matrix=
[[0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0, 0],
[0, 0, 2, 1, 0, 0, 0],
[0, 1, 0, 1, 2, 0, 0],
[0, 0, 0, 0, 0, 3, 1],
[0, 0, 1, 1, 0, 1, 2]]>
Target: CAG
Query: CAG
Needleman-Wunsch-Gotoh
Target: ACCAGT
Query: ACAGC
{:m=>
#<DP::Matrix:0x00007fb6cd1366a0
@matrix=
[[0, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity],
[-Infinity, 1, -3, -4, -3, -6, -7],
[-Infinity, -3, 2, 0, -3, -4, -5],
[-Infinity, -2, -2, 1, 1, -2, -3],
[-Infinity, -5, -3, -1, 0, 2, -2],
[-Infinity, -6, -2, 0, -2, -1, 1]]>,
:x=>
#<DP::Matrix:0x00007fb6cd1361a0
@matrix=
[[-Infinity, -2, -3, -4, -5, -6, -7],
[-Infinity, -Infinity, -1, -2, -3, -4, -5],
[-Infinity, -Infinity, -5, 0, -1, -2, -3],
[-Infinity, -Infinity, -4, -4, -1, -1, -2],
[-Infinity, -Infinity, -7, -5, -3, -2, 0],
[-Infinity, -Infinity, -8, -4, -2, -3, -3]]>,
:y=>
#<DP::Matrix:0x00007fb6cd135cc8
@matrix=
[[-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity],
[-2, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity],
[-3, -1, -5, -6, -5, -8, -9],
[-4, -2, 0, -2, -5, -6, -7],
[-5, -3, -1, -1, -1, -4, -5],
[-6, -4, -2, -2, -2, 0, -4]]>}
Target: ACCAGT
Query: A-CAGC
Moved to https://github.com/ktym/dp