Skip to content

Instantly share code, notes, and snippets.

@dkohlsdorf
Created October 26, 2025 17:40
Show Gist options
  • Select an option

  • Save dkohlsdorf/aba0b87c5d29bf8a694b0ccdfa49d849 to your computer and use it in GitHub Desktop.

Select an option

Save dkohlsdorf/aba0b87c5d29bf8a694b0ccdfa49d849 to your computer and use it in GitHub Desktop.
Multiple Sequence Alignment
#include <algorithm>
#include <vector>
#include <tuple>
#include <cmath>
#include <cstdio>
#include <limits>
#include <memory>
const int SYMBOLIC_GAP = -1;
const std::vector<double> NUMERIC_GAP = std::vector<double>(0);
const int DIAG = 0;
const int UP = 1;
const int LEFT = 2;
template<typename T> using Column = std::vector<T>;
template<typename T> using Sequence = std::vector<Column<T>>;
using Vector = std::vector<double>;
using Symbol = int;
using NumericSequence = Sequence<Vector>;
using SymbolicSequence = Sequence<Symbol>;
template<typename T> class AlignmentUtil {
public:
virtual ~AlignmentUtil() = default;
virtual double score(const Column<T> &x, const Column<T> &y) const = 0;
virtual bool is_gap(const T &symbol) const = 0;
virtual T make_gap() const = 0;
};
class NumericAlignmentUtil : public AlignmentUtil<Vector> {
public:
NumericAlignmentUtil(double symmetric_error, double sigma)
: error(symmetric_error), sigma(sigma) {}
double score(const Column<Vector> &x, const Column<Vector> &y) const override {
/**
Scoring for numeric vectors using Gaussian-like similarity:
For each pair of vectors (v1, v2):
similarity(v1, v2) = exp(-euclidean_dist²(v1, v2) / (2 * sigma²))
The final score averages all pairwise similarities:
score = error * (2 * avg_similarity - 1)
This gives:
- score ≈ +error when vectors are very similar
- score ≈ -error when vectors are very different
- score ≈ 0 for moderate similarity
**/
// Filter out gaps from both columns
std::vector<Vector> valid_x;
std::vector<Vector> valid_y;
for (const auto& vec : x) {
if (!is_gap(vec)) {
valid_x.push_back(vec);
}
}
for (const auto& vec : y) {
if (!is_gap(vec)) {
valid_y.push_back(vec);
}
}
// If either column has no valid vectors, return negative score
if (valid_x.empty() || valid_y.empty()) {
return -this->error;
}
// Compute all pairwise similarities
double total_similarity = 0.0;
int count = 0;
for (const auto& v1 : valid_x) {
for (const auto& v2 : valid_y) {
double dist_sq = euclidean_distance_squared(v1, v2);
double similarity = std::exp(-dist_sq / (2.0 * this->sigma * this->sigma));
total_similarity += similarity;
count++;
}
}
// Average similarity and scale to [-error, +error]
double avg_similarity = total_similarity / count;
return this->error * (2.0 * avg_similarity - 1.0);
}
bool is_gap(const Vector &vec) const override {
return vec.empty();
}
Vector make_gap() const override {
return NUMERIC_GAP;
}
private:
double error;
double sigma;
double euclidean_distance_squared(const Vector &v1, const Vector &v2) const {
// Vectors must have the same dimension
if (v1.size() != v2.size()) {
return std::numeric_limits<double>::max();
}
double dist_sq = 0.0;
for (size_t i = 0; i < v1.size(); i++) {
double diff = v1[i] - v2[i];
dist_sq += diff * diff;
}
return dist_sq;
}
};
class SymbolicAlignmentUtil : public AlignmentUtil<Symbol> {
public:
SymbolicAlignmentUtil(int alphabet_size, double symmetric_error)
: alphabet_size(alphabet_size), error(symmetric_error) {}
double score(const Column<Symbol> &x, const Column<Symbol> &y) const override {
/**
The scoring logic can be mathematically simplified:
score = Σ(i,j) counts_x[i] * counts_y[j] * s(i,j)
where s(i,j) = +error if i==j and i>0 (diagonal matches)
= -error otherwise (mismatches and gaps)
This equals:
= error * Σ(diagonal_matches) - error * Σ(all_other_pairs)
= error * Σ(diagonal_matches) - error * (total_pairs - diagonal_matches)
= 2 * error * Σ(diagonal_matches) - error * total_x * total_y
This allows us to reduce O(n²) to O(n) with a single pass!
**/
std::vector<int> counts_x(this->alphabet_size + 1, 0);
std::vector<int> counts_y(this->alphabet_size + 1, 0);
this->count(x, counts_x);
this->count(y, counts_y);
// Single-pass optimization: calculate totals and diagonal matches together
// Complexity: O(n²) → O(n) where n = alphabet_size + 1
double total_x = 0.0;
double total_y = 0.0;
double diagonal_matches = 0.0;
for(int i = 0; i < this->alphabet_size + 1; i++) {
total_x += counts_x[i];
total_y += counts_y[i];
// Only count non-gap diagonal matches (i > 0 skips gap index)
if (i > 0) {
diagonal_matches += counts_x[i] * counts_y[i];
}
}
// Prevent division by zero
if (total_x == 0.0 || total_y == 0.0) {
return 0.0;
}
// Mathematical optimization:
// score = 2 * error * diagonal_matches - error * total_x * total_y
// Simplified: score / (total_x * total_y) = 2 * error * diagonal_matches / (total_x * total_y) - error
return 2.0 * this->error * diagonal_matches / (total_x * total_y) - this->error;
}
bool is_gap(const Symbol &symbol) const override {
return symbol == SYMBOLIC_GAP;
}
Symbol make_gap() const override {
return SYMBOLIC_GAP;
}
private:
int alphabet_size;
double error;
void count(const Column<Symbol> &col, std::vector<int> &counts) const {
for(size_t i = 0; i < col.size(); i++) {
counts[col[i] + 1]++;
}
}
};
template<typename T>
class Alignment {
public:
Alignment(std::unique_ptr<AlignmentUtil<T>> util, double gap_penalty, int warping_band)
: util(std::move(util)), gap_penalty(gap_penalty), warping_band(warping_band) {}
double align(const Sequence<T> &x, const Sequence<T> &y, Sequence<T> &aligned_sequence) const;
private:
std::unique_ptr<AlignmentUtil<T>> util;
double gap_penalty;
int warping_band;
};
std::tuple<double, int> max3(double up, double diag, double left) {
double max_val = diag;
int max_dir = DIAG;
if (up > max_val) {
max_val = up;
max_dir = UP;
}
if (left > max_val) {
max_val = left;
max_dir = LEFT;
}
return std::make_tuple(max_val, max_dir);
}
template<typename T>
double Alignment<T>::align(const Sequence<T> &x,
const Sequence<T> &y,
Sequence<T> &aligned_sequence) const {
int n = x.size();
int m = y.size();
int min_band = std::abs(n - m) + 2;
int w = std::max(this->warping_band, min_band);
// DP table and traceback table
// Initialize all cells to -infinity (cells outside band should never be used)
std::vector<double> dp((n + 1) * (m + 1), -std::numeric_limits<double>::infinity());
std::vector<int> tb((n + 1) * (m + 1), -1);
// 2D index to 1D array mapping
auto idx = [m](int i, int j) {
return i * (m + 1) + j;
};
// Initialize origin
dp[idx(0, 0)] = 0.0;
tb[idx(0, 0)] = -1;
// Initialize boundaries
for(int i = 1; i < n + 1; i++) {
dp[idx(i, 0)] = -1 * i * gap_penalty;
tb[idx(i, 0)] = UP;
}
for(int j = 1; j < m + 1; j++) {
dp[idx(0, j)] = -1 * j * gap_penalty;
tb[idx(0, j)] = LEFT;
}
// Fill DP table with warping band
for(int i = 1; i < n + 1; i++) {
int start = std::max(1, i - w);
int stop = std::min(i + w, m + 1);
for(int j = start; j < stop; j++) {
double match_score = this->util->score(x[i - 1], y[j - 1]);
auto [alignment_score, trace] = max3(
dp[idx(i-1, j)] - gap_penalty,
dp[idx(i-1, j-1)] + match_score,
dp[idx(i, j-1)] - gap_penalty
);
dp[idx(i, j)] = alignment_score;
tb[idx(i, j)] = trace;
}
}
// Traceback to build alignment
int i = n;
int j = m;
while(i > 0 || j > 0) {
int trace = tb[idx(i, j)];
if(trace == DIAG && i > 0 && j > 0) {
// Match/mismatch: take from both sequences
const auto& x_col = x[i - 1];
const auto& y_col = y[j - 1];
Column<T> col;
col.reserve(x_col.size() + y_col.size());
col.insert(col.end(), x_col.begin(), x_col.end());
col.insert(col.end(), y_col.begin(), y_col.end());
aligned_sequence.push_back(std::move(col));
i -= 1;
j -= 1;
} else if(trace == UP || (i > 0 && j == 0)) {
// Gap in y: take from x only
const auto& x_col = x[i - 1];
int y_count = (j > 0) ? y[j - 1].size() : ((m > 0) ? y[0].size() : 0);
Column<T> col;
col.reserve(x_col.size() + y_count);
col.insert(col.end(), x_col.begin(), x_col.end());
// Add gaps for y sequences
T gap = this->util->make_gap();
for(int yj = 0; yj < y_count; yj++) {
col.push_back(gap);
}
aligned_sequence.push_back(std::move(col));
i -= 1;
} else if(trace == LEFT || (j > 0 && i == 0)) {
// Gap in x: take from y only
const auto& y_col = y[j - 1];
int x_count = (i > 0) ? x[i - 1].size() : ((n > 0) ? x[0].size() : 0);
Column<T> col;
col.reserve(x_count + y_col.size());
// Add gaps for x sequences
T gap = this->util->make_gap();
for(int xi = 0; xi < x_count; xi++) {
col.push_back(gap);
}
col.insert(col.end(), y_col.begin(), y_col.end());
aligned_sequence.push_back(std::move(col));
j -= 1;
} else {
// Shouldn't reach here if tb is properly initialized
break;
}
}
// Reverse alignment since we built it backwards
std::reverse(aligned_sequence.begin(), aligned_sequence.end());
return dp[idx(n, m)];
}
int main(int argc, char** argv) {
// Test the optimized scoring function
SymbolicAlignmentUtil util(4, 1.0);
// Test case 1: Identical columns
Column<Symbol> col1 = {0, 1, 2};
Column<Symbol> col2 = {0, 1, 2};
double score1 = util.score(col1, col2);
// Test case 2: Different columns
Column<Symbol> col3 = {0, 0, 1};
Column<Symbol> col4 = {1, 2, 2};
double score2 = util.score(col3, col4);
// Test case 3: With gaps
Column<Symbol> col5 = {-1, 0, 1};
Column<Symbol> col6 = {0, 1, 2};
double score3 = util.score(col5, col6);
// Print results for verification
printf("=== Scoring Function Tests ===\n");
printf("Test 1 (identical): %.6f\n", score1);
printf("Test 2 (different): %.6f\n", score2);
printf("Test 3 (with gaps): %.6f\n", score3);
// Test MSA alignment
printf("\n=== MSA Alignment Test ===\n");
// Create two simple sequences to align
// Sequence X: A-C-T
SymbolicSequence seq_x = {
{0}, // Column with symbol A (0)
{1}, // Column with symbol C (1)
{2} // Column with symbol T (2)
};
// Sequence Y: A-G-C-T
SymbolicSequence seq_y = {
{0}, // Column with symbol A (0)
{3}, // Column with symbol G (3)
{1}, // Column with symbol C (1)
{2} // Column with symbol T (2)
};
// Create alignment utility and alignment object
auto align_util = std::make_unique<SymbolicAlignmentUtil>(4, 1.0);
Alignment<Symbol> aligner(std::move(align_util), 0.5, 10);
// Perform alignment
SymbolicSequence aligned;
double alignment_score = aligner.align(seq_x, seq_y, aligned);
printf("Input sequence X (length %zu): ", seq_x.size());
for (const auto& col : seq_x) {
printf("%d ", col[0]);
}
printf("\n");
printf("Input sequence Y (length %zu): ", seq_y.size());
for (const auto& col : seq_y) {
printf("%d ", col[0]);
}
printf("\n");
printf("Alignment score: %.6f\n", alignment_score);
printf("Aligned sequence (length %zu):\n", aligned.size());
for (size_t i = 0; i < aligned.size(); i++) {
printf(" Position %zu: [", i);
for (size_t j = 0; j < aligned[i].size(); j++) {
if (aligned[i][j] == SYMBOLIC_GAP) {
printf("GAP");
} else {
printf("%d", aligned[i][j]);
}
if (j < aligned[i].size() - 1) printf(", ");
}
printf("]\n");
}
// Test MSA-to-MSA alignment
printf("\n=== MSA-to-MSA Alignment Test ===\n");
printf("Aligning two multiple sequence alignments\n\n");
// MSA 1: Contains 3 aligned sequences
// Position 0: [A, A, A] - all three sequences have A
// Position 1: [C, C, T] - two C's and one T
// Position 2: [G, G, G] - all three sequences have G
SymbolicSequence msa1 = {
{0, 0, 0}, // All A's
{1, 1, 2}, // Two C's and one T
{3, 3, 3} // All G's
};
// MSA 2: Contains 2 aligned sequences
// Position 0: [A, A] - both have A
// Position 1: [GAP, C] - first has gap, second has C
// Position 2: [T, T] - both have T
// Position 3: [G, G] - both have G
SymbolicSequence msa2 = {
{0, 0}, // Both A's
{-1, 1}, // Gap and C
{2, 2}, // Both T's
{3, 3} // Both G's
};
printf("MSA 1 (3 sequences, length %zu):\n", msa1.size());
for (size_t i = 0; i < msa1.size(); i++) {
printf(" Column %zu: [", i);
for (size_t j = 0; j < msa1[i].size(); j++) {
if (msa1[i][j] == SYMBOLIC_GAP) {
printf("GAP");
} else {
printf("%d", msa1[i][j]);
}
if (j < msa1[i].size() - 1) printf(", ");
}
printf("]\n");
}
printf("\nMSA 2 (2 sequences, length %zu):\n", msa2.size());
for (size_t i = 0; i < msa2.size(); i++) {
printf(" Column %zu: [", i);
for (size_t j = 0; j < msa2[i].size(); j++) {
if (msa2[i][j] == SYMBOLIC_GAP) {
printf("GAP");
} else {
printf("%d", msa2[i][j]);
}
if (j < msa2[i].size() - 1) printf(", ");
}
printf("]\n");
}
// Align the two MSAs
auto msa_align_util = std::make_unique<SymbolicAlignmentUtil>(4, 1.0);
Alignment<Symbol> msa_aligner(std::move(msa_align_util), 0.5, 10);
SymbolicSequence aligned_msa;
double msa_score = msa_aligner.align(msa1, msa2, aligned_msa);
printf("\nMSA-to-MSA alignment score: %.6f\n", msa_score);
printf("Aligned MSA (combines all 5 sequences, length %zu):\n", aligned_msa.size());
for (size_t i = 0; i < aligned_msa.size(); i++) {
printf(" Column %zu: [", i);
for (size_t j = 0; j < aligned_msa[i].size(); j++) {
if (aligned_msa[i][j] == SYMBOLIC_GAP) {
printf("GAP");
} else {
printf("%d", aligned_msa[i][j]);
}
if (j < aligned_msa[i].size() - 1) printf(", ");
}
printf("] (%zu total symbols: first 3 from MSA1, last 2 from MSA2)\n", aligned_msa[i].size());
}
// Test Numeric alignment
printf("\n=== Numeric Vector Alignment Test ===\n");
printf("Aligning sequences of numeric vectors\n\n");
// Numeric sequence 1: Three 2D vectors (single sequence)
NumericSequence num_seq1 = {
{{1.0, 2.0}}, // Position 0: vector [1.0, 2.0]
{{3.0, 4.0}}, // Position 1: vector [3.0, 4.0]
{{5.0, 6.0}} // Position 2: vector [5.0, 6.0]
};
// Numeric sequence 2: Four 2D vectors (slightly different)
NumericSequence num_seq2 = {
{{1.1, 2.1}}, // Position 0: similar to seq1[0]
{{7.0, 8.0}}, // Position 1: different vector
{{3.1, 4.1}}, // Position 2: similar to seq1[1]
{{5.1, 6.1}} // Position 3: similar to seq1[2]
};
printf("Numeric sequence 1 (length %zu):\n", num_seq1.size());
for (size_t i = 0; i < num_seq1.size(); i++) {
printf(" Column %zu: [", i);
for (size_t j = 0; j < num_seq1[i].size(); j++) {
printf("[");
for (size_t k = 0; k < num_seq1[i][j].size(); k++) {
printf("%.1f", num_seq1[i][j][k]);
if (k < num_seq1[i][j].size() - 1) printf(", ");
}
printf("]");
if (j < num_seq1[i].size() - 1) printf(", ");
}
printf("]\n");
}
printf("\nNumeric sequence 2 (length %zu):\n", num_seq2.size());
for (size_t i = 0; i < num_seq2.size(); i++) {
printf(" Column %zu: [", i);
for (size_t j = 0; j < num_seq2[i].size(); j++) {
printf("[");
for (size_t k = 0; k < num_seq2[i][j].size(); k++) {
printf("%.1f", num_seq2[i][j][k]);
if (k < num_seq2[i][j].size() - 1) printf(", ");
}
printf("]");
if (j < num_seq2[i].size() - 1) printf(", ");
}
printf("]\n");
}
// Create numeric alignment with sigma=1.0 (controls similarity sensitivity)
auto num_align_util = std::make_unique<NumericAlignmentUtil>(1.0, 1.0);
Alignment<Vector> num_aligner(std::move(num_align_util), 0.5, 10);
NumericSequence aligned_num;
double num_score = num_aligner.align(num_seq1, num_seq2, aligned_num);
printf("\nNumeric alignment score: %.6f\n", num_score);
printf("Aligned numeric sequence (length %zu):\n", aligned_num.size());
for (size_t i = 0; i < aligned_num.size(); i++) {
printf(" Column %zu: [", i);
for (size_t j = 0; j < aligned_num[i].size(); j++) {
if (aligned_num[i][j].empty()) {
printf("GAP");
} else {
printf("[");
for (size_t k = 0; k < aligned_num[i][j].size(); k++) {
printf("%.1f", aligned_num[i][j][k]);
if (k < aligned_num[i][j].size() - 1) printf(", ");
}
printf("]");
}
if (j < aligned_num[i].size() - 1) printf(", ");
}
printf("]\n");
}
// Test Numeric MSA-to-MSA alignment
printf("\n=== Numeric MSA-to-MSA Alignment Test ===\n");
printf("Aligning two numeric multiple sequence alignments\n\n");
// Numeric MSA 1: 3 sequences aligned (3 vectors per column)
// Represents 3 feature vector sequences aligned together
NumericSequence num_msa1 = {
{{1.0, 2.0}, {1.1, 2.1}, {1.2, 2.2}}, // Column 0: 3 similar vectors
{{3.0, 4.0}, {3.0, 4.1}, {3.1, 4.0}}, // Column 1: 3 similar vectors
{{5.0, 6.0}, {5.1, 6.0}, {5.0, 6.1}} // Column 2: 3 similar vectors
};
// Numeric MSA 2: 2 sequences aligned (2 vectors per column)
// One has a gap in the middle
NumericSequence num_msa2 = {
{{1.0, 2.0}, {1.15, 2.05}}, // Column 0: 2 similar vectors
{{}, {3.2, 4.2}}, // Column 1: gap and a vector
{{7.0, 8.0}, {7.1, 8.1}}, // Column 2: different vectors
{{5.0, 6.0}, {5.05, 6.05}} // Column 3: similar to msa1[2]
};
printf("Numeric MSA 1 (3 sequences, length %zu):\n", num_msa1.size());
for (size_t i = 0; i < num_msa1.size(); i++) {
printf(" Column %zu: [", i);
for (size_t j = 0; j < num_msa1[i].size(); j++) {
if (num_msa1[i][j].empty()) {
printf("GAP");
} else {
printf("[");
for (size_t k = 0; k < num_msa1[i][j].size(); k++) {
printf("%.1f", num_msa1[i][j][k]);
if (k < num_msa1[i][j].size() - 1) printf(", ");
}
printf("]");
}
if (j < num_msa1[i].size() - 1) printf(", ");
}
printf("]\n");
}
printf("\nNumeric MSA 2 (2 sequences, length %zu):\n", num_msa2.size());
for (size_t i = 0; i < num_msa2.size(); i++) {
printf(" Column %zu: [", i);
for (size_t j = 0; j < num_msa2[i].size(); j++) {
if (num_msa2[i][j].empty()) {
printf("GAP");
} else {
printf("[");
for (size_t k = 0; k < num_msa2[i][j].size(); k++) {
printf("%.1f", num_msa2[i][j][k]);
if (k < num_msa2[i][j].size() - 1) printf(", ");
}
printf("]");
}
if (j < num_msa2[i].size() - 1) printf(", ");
}
printf("]\n");
}
// Align the two numeric MSAs
auto num_msa_util = std::make_unique<NumericAlignmentUtil>(1.0, 1.0);
Alignment<Vector> num_msa_aligner(std::move(num_msa_util), 0.5, 10);
NumericSequence aligned_num_msa;
double num_msa_score = num_msa_aligner.align(num_msa1, num_msa2, aligned_num_msa);
printf("\nNumeric MSA-to-MSA alignment score: %.6f\n", num_msa_score);
printf("Aligned numeric MSA (combines all 5 sequences, length %zu):\n", aligned_num_msa.size());
for (size_t i = 0; i < aligned_num_msa.size(); i++) {
printf(" Column %zu: [", i);
for (size_t j = 0; j < aligned_num_msa[i].size(); j++) {
if (aligned_num_msa[i][j].empty()) {
printf("GAP");
} else {
printf("[");
for (size_t k = 0; k < aligned_num_msa[i][j].size(); k++) {
printf("%.2f", aligned_num_msa[i][j][k]);
if (k < aligned_num_msa[i][j].size() - 1) printf(", ");
}
printf("]");
}
if (j < aligned_num_msa[i].size() - 1) printf(", ");
}
printf("]\n");
}
printf("\nInterpretation: Each column contains 5 vectors total:\n");
printf(" - First 3 vectors from MSA1\n");
printf(" - Last 2 vectors from MSA2\n");
printf(" - GAP entries where sequences don't align\n");
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment