Created
October 26, 2025 17:40
-
-
Save dkohlsdorf/aba0b87c5d29bf8a694b0ccdfa49d849 to your computer and use it in GitHub Desktop.
Multiple Sequence Alignment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <algorithm> | |
| #include <vector> | |
| #include <tuple> | |
| #include <cmath> | |
| #include <cstdio> | |
| #include <limits> | |
| #include <memory> | |
| const int SYMBOLIC_GAP = -1; | |
| const std::vector<double> NUMERIC_GAP = std::vector<double>(0); | |
| const int DIAG = 0; | |
| const int UP = 1; | |
| const int LEFT = 2; | |
| template<typename T> using Column = std::vector<T>; | |
| template<typename T> using Sequence = std::vector<Column<T>>; | |
| using Vector = std::vector<double>; | |
| using Symbol = int; | |
| using NumericSequence = Sequence<Vector>; | |
| using SymbolicSequence = Sequence<Symbol>; | |
| template<typename T> class AlignmentUtil { | |
| public: | |
| virtual ~AlignmentUtil() = default; | |
| virtual double score(const Column<T> &x, const Column<T> &y) const = 0; | |
| virtual bool is_gap(const T &symbol) const = 0; | |
| virtual T make_gap() const = 0; | |
| }; | |
| class NumericAlignmentUtil : public AlignmentUtil<Vector> { | |
| public: | |
| NumericAlignmentUtil(double symmetric_error, double sigma) | |
| : error(symmetric_error), sigma(sigma) {} | |
| double score(const Column<Vector> &x, const Column<Vector> &y) const override { | |
| /** | |
| Scoring for numeric vectors using Gaussian-like similarity: | |
| For each pair of vectors (v1, v2): | |
| similarity(v1, v2) = exp(-euclidean_dist²(v1, v2) / (2 * sigma²)) | |
| The final score averages all pairwise similarities: | |
| score = error * (2 * avg_similarity - 1) | |
| This gives: | |
| - score ≈ +error when vectors are very similar | |
| - score ≈ -error when vectors are very different | |
| - score ≈ 0 for moderate similarity | |
| **/ | |
| // Filter out gaps from both columns | |
| std::vector<Vector> valid_x; | |
| std::vector<Vector> valid_y; | |
| for (const auto& vec : x) { | |
| if (!is_gap(vec)) { | |
| valid_x.push_back(vec); | |
| } | |
| } | |
| for (const auto& vec : y) { | |
| if (!is_gap(vec)) { | |
| valid_y.push_back(vec); | |
| } | |
| } | |
| // If either column has no valid vectors, return negative score | |
| if (valid_x.empty() || valid_y.empty()) { | |
| return -this->error; | |
| } | |
| // Compute all pairwise similarities | |
| double total_similarity = 0.0; | |
| int count = 0; | |
| for (const auto& v1 : valid_x) { | |
| for (const auto& v2 : valid_y) { | |
| double dist_sq = euclidean_distance_squared(v1, v2); | |
| double similarity = std::exp(-dist_sq / (2.0 * this->sigma * this->sigma)); | |
| total_similarity += similarity; | |
| count++; | |
| } | |
| } | |
| // Average similarity and scale to [-error, +error] | |
| double avg_similarity = total_similarity / count; | |
| return this->error * (2.0 * avg_similarity - 1.0); | |
| } | |
| bool is_gap(const Vector &vec) const override { | |
| return vec.empty(); | |
| } | |
| Vector make_gap() const override { | |
| return NUMERIC_GAP; | |
| } | |
| private: | |
| double error; | |
| double sigma; | |
| double euclidean_distance_squared(const Vector &v1, const Vector &v2) const { | |
| // Vectors must have the same dimension | |
| if (v1.size() != v2.size()) { | |
| return std::numeric_limits<double>::max(); | |
| } | |
| double dist_sq = 0.0; | |
| for (size_t i = 0; i < v1.size(); i++) { | |
| double diff = v1[i] - v2[i]; | |
| dist_sq += diff * diff; | |
| } | |
| return dist_sq; | |
| } | |
| }; | |
| class SymbolicAlignmentUtil : public AlignmentUtil<Symbol> { | |
| public: | |
| SymbolicAlignmentUtil(int alphabet_size, double symmetric_error) | |
| : alphabet_size(alphabet_size), error(symmetric_error) {} | |
| double score(const Column<Symbol> &x, const Column<Symbol> &y) const override { | |
| /** | |
| The scoring logic can be mathematically simplified: | |
| score = Σ(i,j) counts_x[i] * counts_y[j] * s(i,j) | |
| where s(i,j) = +error if i==j and i>0 (diagonal matches) | |
| = -error otherwise (mismatches and gaps) | |
| This equals: | |
| = error * Σ(diagonal_matches) - error * Σ(all_other_pairs) | |
| = error * Σ(diagonal_matches) - error * (total_pairs - diagonal_matches) | |
| = 2 * error * Σ(diagonal_matches) - error * total_x * total_y | |
| This allows us to reduce O(n²) to O(n) with a single pass! | |
| **/ | |
| std::vector<int> counts_x(this->alphabet_size + 1, 0); | |
| std::vector<int> counts_y(this->alphabet_size + 1, 0); | |
| this->count(x, counts_x); | |
| this->count(y, counts_y); | |
| // Single-pass optimization: calculate totals and diagonal matches together | |
| // Complexity: O(n²) → O(n) where n = alphabet_size + 1 | |
| double total_x = 0.0; | |
| double total_y = 0.0; | |
| double diagonal_matches = 0.0; | |
| for(int i = 0; i < this->alphabet_size + 1; i++) { | |
| total_x += counts_x[i]; | |
| total_y += counts_y[i]; | |
| // Only count non-gap diagonal matches (i > 0 skips gap index) | |
| if (i > 0) { | |
| diagonal_matches += counts_x[i] * counts_y[i]; | |
| } | |
| } | |
| // Prevent division by zero | |
| if (total_x == 0.0 || total_y == 0.0) { | |
| return 0.0; | |
| } | |
| // Mathematical optimization: | |
| // score = 2 * error * diagonal_matches - error * total_x * total_y | |
| // Simplified: score / (total_x * total_y) = 2 * error * diagonal_matches / (total_x * total_y) - error | |
| return 2.0 * this->error * diagonal_matches / (total_x * total_y) - this->error; | |
| } | |
| bool is_gap(const Symbol &symbol) const override { | |
| return symbol == SYMBOLIC_GAP; | |
| } | |
| Symbol make_gap() const override { | |
| return SYMBOLIC_GAP; | |
| } | |
| private: | |
| int alphabet_size; | |
| double error; | |
| void count(const Column<Symbol> &col, std::vector<int> &counts) const { | |
| for(size_t i = 0; i < col.size(); i++) { | |
| counts[col[i] + 1]++; | |
| } | |
| } | |
| }; | |
| template<typename T> | |
| class Alignment { | |
| public: | |
| Alignment(std::unique_ptr<AlignmentUtil<T>> util, double gap_penalty, int warping_band) | |
| : util(std::move(util)), gap_penalty(gap_penalty), warping_band(warping_band) {} | |
| double align(const Sequence<T> &x, const Sequence<T> &y, Sequence<T> &aligned_sequence) const; | |
| private: | |
| std::unique_ptr<AlignmentUtil<T>> util; | |
| double gap_penalty; | |
| int warping_band; | |
| }; | |
| std::tuple<double, int> max3(double up, double diag, double left) { | |
| double max_val = diag; | |
| int max_dir = DIAG; | |
| if (up > max_val) { | |
| max_val = up; | |
| max_dir = UP; | |
| } | |
| if (left > max_val) { | |
| max_val = left; | |
| max_dir = LEFT; | |
| } | |
| return std::make_tuple(max_val, max_dir); | |
| } | |
| template<typename T> | |
| double Alignment<T>::align(const Sequence<T> &x, | |
| const Sequence<T> &y, | |
| Sequence<T> &aligned_sequence) const { | |
| int n = x.size(); | |
| int m = y.size(); | |
| int min_band = std::abs(n - m) + 2; | |
| int w = std::max(this->warping_band, min_band); | |
| // DP table and traceback table | |
| // Initialize all cells to -infinity (cells outside band should never be used) | |
| std::vector<double> dp((n + 1) * (m + 1), -std::numeric_limits<double>::infinity()); | |
| std::vector<int> tb((n + 1) * (m + 1), -1); | |
| // 2D index to 1D array mapping | |
| auto idx = [m](int i, int j) { | |
| return i * (m + 1) + j; | |
| }; | |
| // Initialize origin | |
| dp[idx(0, 0)] = 0.0; | |
| tb[idx(0, 0)] = -1; | |
| // Initialize boundaries | |
| for(int i = 1; i < n + 1; i++) { | |
| dp[idx(i, 0)] = -1 * i * gap_penalty; | |
| tb[idx(i, 0)] = UP; | |
| } | |
| for(int j = 1; j < m + 1; j++) { | |
| dp[idx(0, j)] = -1 * j * gap_penalty; | |
| tb[idx(0, j)] = LEFT; | |
| } | |
| // Fill DP table with warping band | |
| for(int i = 1; i < n + 1; i++) { | |
| int start = std::max(1, i - w); | |
| int stop = std::min(i + w, m + 1); | |
| for(int j = start; j < stop; j++) { | |
| double match_score = this->util->score(x[i - 1], y[j - 1]); | |
| auto [alignment_score, trace] = max3( | |
| dp[idx(i-1, j)] - gap_penalty, | |
| dp[idx(i-1, j-1)] + match_score, | |
| dp[idx(i, j-1)] - gap_penalty | |
| ); | |
| dp[idx(i, j)] = alignment_score; | |
| tb[idx(i, j)] = trace; | |
| } | |
| } | |
| // Traceback to build alignment | |
| int i = n; | |
| int j = m; | |
| while(i > 0 || j > 0) { | |
| int trace = tb[idx(i, j)]; | |
| if(trace == DIAG && i > 0 && j > 0) { | |
| // Match/mismatch: take from both sequences | |
| const auto& x_col = x[i - 1]; | |
| const auto& y_col = y[j - 1]; | |
| Column<T> col; | |
| col.reserve(x_col.size() + y_col.size()); | |
| col.insert(col.end(), x_col.begin(), x_col.end()); | |
| col.insert(col.end(), y_col.begin(), y_col.end()); | |
| aligned_sequence.push_back(std::move(col)); | |
| i -= 1; | |
| j -= 1; | |
| } else if(trace == UP || (i > 0 && j == 0)) { | |
| // Gap in y: take from x only | |
| const auto& x_col = x[i - 1]; | |
| int y_count = (j > 0) ? y[j - 1].size() : ((m > 0) ? y[0].size() : 0); | |
| Column<T> col; | |
| col.reserve(x_col.size() + y_count); | |
| col.insert(col.end(), x_col.begin(), x_col.end()); | |
| // Add gaps for y sequences | |
| T gap = this->util->make_gap(); | |
| for(int yj = 0; yj < y_count; yj++) { | |
| col.push_back(gap); | |
| } | |
| aligned_sequence.push_back(std::move(col)); | |
| i -= 1; | |
| } else if(trace == LEFT || (j > 0 && i == 0)) { | |
| // Gap in x: take from y only | |
| const auto& y_col = y[j - 1]; | |
| int x_count = (i > 0) ? x[i - 1].size() : ((n > 0) ? x[0].size() : 0); | |
| Column<T> col; | |
| col.reserve(x_count + y_col.size()); | |
| // Add gaps for x sequences | |
| T gap = this->util->make_gap(); | |
| for(int xi = 0; xi < x_count; xi++) { | |
| col.push_back(gap); | |
| } | |
| col.insert(col.end(), y_col.begin(), y_col.end()); | |
| aligned_sequence.push_back(std::move(col)); | |
| j -= 1; | |
| } else { | |
| // Shouldn't reach here if tb is properly initialized | |
| break; | |
| } | |
| } | |
| // Reverse alignment since we built it backwards | |
| std::reverse(aligned_sequence.begin(), aligned_sequence.end()); | |
| return dp[idx(n, m)]; | |
| } | |
| int main(int argc, char** argv) { | |
| // Test the optimized scoring function | |
| SymbolicAlignmentUtil util(4, 1.0); | |
| // Test case 1: Identical columns | |
| Column<Symbol> col1 = {0, 1, 2}; | |
| Column<Symbol> col2 = {0, 1, 2}; | |
| double score1 = util.score(col1, col2); | |
| // Test case 2: Different columns | |
| Column<Symbol> col3 = {0, 0, 1}; | |
| Column<Symbol> col4 = {1, 2, 2}; | |
| double score2 = util.score(col3, col4); | |
| // Test case 3: With gaps | |
| Column<Symbol> col5 = {-1, 0, 1}; | |
| Column<Symbol> col6 = {0, 1, 2}; | |
| double score3 = util.score(col5, col6); | |
| // Print results for verification | |
| printf("=== Scoring Function Tests ===\n"); | |
| printf("Test 1 (identical): %.6f\n", score1); | |
| printf("Test 2 (different): %.6f\n", score2); | |
| printf("Test 3 (with gaps): %.6f\n", score3); | |
| // Test MSA alignment | |
| printf("\n=== MSA Alignment Test ===\n"); | |
| // Create two simple sequences to align | |
| // Sequence X: A-C-T | |
| SymbolicSequence seq_x = { | |
| {0}, // Column with symbol A (0) | |
| {1}, // Column with symbol C (1) | |
| {2} // Column with symbol T (2) | |
| }; | |
| // Sequence Y: A-G-C-T | |
| SymbolicSequence seq_y = { | |
| {0}, // Column with symbol A (0) | |
| {3}, // Column with symbol G (3) | |
| {1}, // Column with symbol C (1) | |
| {2} // Column with symbol T (2) | |
| }; | |
| // Create alignment utility and alignment object | |
| auto align_util = std::make_unique<SymbolicAlignmentUtil>(4, 1.0); | |
| Alignment<Symbol> aligner(std::move(align_util), 0.5, 10); | |
| // Perform alignment | |
| SymbolicSequence aligned; | |
| double alignment_score = aligner.align(seq_x, seq_y, aligned); | |
| printf("Input sequence X (length %zu): ", seq_x.size()); | |
| for (const auto& col : seq_x) { | |
| printf("%d ", col[0]); | |
| } | |
| printf("\n"); | |
| printf("Input sequence Y (length %zu): ", seq_y.size()); | |
| for (const auto& col : seq_y) { | |
| printf("%d ", col[0]); | |
| } | |
| printf("\n"); | |
| printf("Alignment score: %.6f\n", alignment_score); | |
| printf("Aligned sequence (length %zu):\n", aligned.size()); | |
| for (size_t i = 0; i < aligned.size(); i++) { | |
| printf(" Position %zu: [", i); | |
| for (size_t j = 0; j < aligned[i].size(); j++) { | |
| if (aligned[i][j] == SYMBOLIC_GAP) { | |
| printf("GAP"); | |
| } else { | |
| printf("%d", aligned[i][j]); | |
| } | |
| if (j < aligned[i].size() - 1) printf(", "); | |
| } | |
| printf("]\n"); | |
| } | |
| // Test MSA-to-MSA alignment | |
| printf("\n=== MSA-to-MSA Alignment Test ===\n"); | |
| printf("Aligning two multiple sequence alignments\n\n"); | |
| // MSA 1: Contains 3 aligned sequences | |
| // Position 0: [A, A, A] - all three sequences have A | |
| // Position 1: [C, C, T] - two C's and one T | |
| // Position 2: [G, G, G] - all three sequences have G | |
| SymbolicSequence msa1 = { | |
| {0, 0, 0}, // All A's | |
| {1, 1, 2}, // Two C's and one T | |
| {3, 3, 3} // All G's | |
| }; | |
| // MSA 2: Contains 2 aligned sequences | |
| // Position 0: [A, A] - both have A | |
| // Position 1: [GAP, C] - first has gap, second has C | |
| // Position 2: [T, T] - both have T | |
| // Position 3: [G, G] - both have G | |
| SymbolicSequence msa2 = { | |
| {0, 0}, // Both A's | |
| {-1, 1}, // Gap and C | |
| {2, 2}, // Both T's | |
| {3, 3} // Both G's | |
| }; | |
| printf("MSA 1 (3 sequences, length %zu):\n", msa1.size()); | |
| for (size_t i = 0; i < msa1.size(); i++) { | |
| printf(" Column %zu: [", i); | |
| for (size_t j = 0; j < msa1[i].size(); j++) { | |
| if (msa1[i][j] == SYMBOLIC_GAP) { | |
| printf("GAP"); | |
| } else { | |
| printf("%d", msa1[i][j]); | |
| } | |
| if (j < msa1[i].size() - 1) printf(", "); | |
| } | |
| printf("]\n"); | |
| } | |
| printf("\nMSA 2 (2 sequences, length %zu):\n", msa2.size()); | |
| for (size_t i = 0; i < msa2.size(); i++) { | |
| printf(" Column %zu: [", i); | |
| for (size_t j = 0; j < msa2[i].size(); j++) { | |
| if (msa2[i][j] == SYMBOLIC_GAP) { | |
| printf("GAP"); | |
| } else { | |
| printf("%d", msa2[i][j]); | |
| } | |
| if (j < msa2[i].size() - 1) printf(", "); | |
| } | |
| printf("]\n"); | |
| } | |
| // Align the two MSAs | |
| auto msa_align_util = std::make_unique<SymbolicAlignmentUtil>(4, 1.0); | |
| Alignment<Symbol> msa_aligner(std::move(msa_align_util), 0.5, 10); | |
| SymbolicSequence aligned_msa; | |
| double msa_score = msa_aligner.align(msa1, msa2, aligned_msa); | |
| printf("\nMSA-to-MSA alignment score: %.6f\n", msa_score); | |
| printf("Aligned MSA (combines all 5 sequences, length %zu):\n", aligned_msa.size()); | |
| for (size_t i = 0; i < aligned_msa.size(); i++) { | |
| printf(" Column %zu: [", i); | |
| for (size_t j = 0; j < aligned_msa[i].size(); j++) { | |
| if (aligned_msa[i][j] == SYMBOLIC_GAP) { | |
| printf("GAP"); | |
| } else { | |
| printf("%d", aligned_msa[i][j]); | |
| } | |
| if (j < aligned_msa[i].size() - 1) printf(", "); | |
| } | |
| printf("] (%zu total symbols: first 3 from MSA1, last 2 from MSA2)\n", aligned_msa[i].size()); | |
| } | |
| // Test Numeric alignment | |
| printf("\n=== Numeric Vector Alignment Test ===\n"); | |
| printf("Aligning sequences of numeric vectors\n\n"); | |
| // Numeric sequence 1: Three 2D vectors (single sequence) | |
| NumericSequence num_seq1 = { | |
| {{1.0, 2.0}}, // Position 0: vector [1.0, 2.0] | |
| {{3.0, 4.0}}, // Position 1: vector [3.0, 4.0] | |
| {{5.0, 6.0}} // Position 2: vector [5.0, 6.0] | |
| }; | |
| // Numeric sequence 2: Four 2D vectors (slightly different) | |
| NumericSequence num_seq2 = { | |
| {{1.1, 2.1}}, // Position 0: similar to seq1[0] | |
| {{7.0, 8.0}}, // Position 1: different vector | |
| {{3.1, 4.1}}, // Position 2: similar to seq1[1] | |
| {{5.1, 6.1}} // Position 3: similar to seq1[2] | |
| }; | |
| printf("Numeric sequence 1 (length %zu):\n", num_seq1.size()); | |
| for (size_t i = 0; i < num_seq1.size(); i++) { | |
| printf(" Column %zu: [", i); | |
| for (size_t j = 0; j < num_seq1[i].size(); j++) { | |
| printf("["); | |
| for (size_t k = 0; k < num_seq1[i][j].size(); k++) { | |
| printf("%.1f", num_seq1[i][j][k]); | |
| if (k < num_seq1[i][j].size() - 1) printf(", "); | |
| } | |
| printf("]"); | |
| if (j < num_seq1[i].size() - 1) printf(", "); | |
| } | |
| printf("]\n"); | |
| } | |
| printf("\nNumeric sequence 2 (length %zu):\n", num_seq2.size()); | |
| for (size_t i = 0; i < num_seq2.size(); i++) { | |
| printf(" Column %zu: [", i); | |
| for (size_t j = 0; j < num_seq2[i].size(); j++) { | |
| printf("["); | |
| for (size_t k = 0; k < num_seq2[i][j].size(); k++) { | |
| printf("%.1f", num_seq2[i][j][k]); | |
| if (k < num_seq2[i][j].size() - 1) printf(", "); | |
| } | |
| printf("]"); | |
| if (j < num_seq2[i].size() - 1) printf(", "); | |
| } | |
| printf("]\n"); | |
| } | |
| // Create numeric alignment with sigma=1.0 (controls similarity sensitivity) | |
| auto num_align_util = std::make_unique<NumericAlignmentUtil>(1.0, 1.0); | |
| Alignment<Vector> num_aligner(std::move(num_align_util), 0.5, 10); | |
| NumericSequence aligned_num; | |
| double num_score = num_aligner.align(num_seq1, num_seq2, aligned_num); | |
| printf("\nNumeric alignment score: %.6f\n", num_score); | |
| printf("Aligned numeric sequence (length %zu):\n", aligned_num.size()); | |
| for (size_t i = 0; i < aligned_num.size(); i++) { | |
| printf(" Column %zu: [", i); | |
| for (size_t j = 0; j < aligned_num[i].size(); j++) { | |
| if (aligned_num[i][j].empty()) { | |
| printf("GAP"); | |
| } else { | |
| printf("["); | |
| for (size_t k = 0; k < aligned_num[i][j].size(); k++) { | |
| printf("%.1f", aligned_num[i][j][k]); | |
| if (k < aligned_num[i][j].size() - 1) printf(", "); | |
| } | |
| printf("]"); | |
| } | |
| if (j < aligned_num[i].size() - 1) printf(", "); | |
| } | |
| printf("]\n"); | |
| } | |
| // Test Numeric MSA-to-MSA alignment | |
| printf("\n=== Numeric MSA-to-MSA Alignment Test ===\n"); | |
| printf("Aligning two numeric multiple sequence alignments\n\n"); | |
| // Numeric MSA 1: 3 sequences aligned (3 vectors per column) | |
| // Represents 3 feature vector sequences aligned together | |
| NumericSequence num_msa1 = { | |
| {{1.0, 2.0}, {1.1, 2.1}, {1.2, 2.2}}, // Column 0: 3 similar vectors | |
| {{3.0, 4.0}, {3.0, 4.1}, {3.1, 4.0}}, // Column 1: 3 similar vectors | |
| {{5.0, 6.0}, {5.1, 6.0}, {5.0, 6.1}} // Column 2: 3 similar vectors | |
| }; | |
| // Numeric MSA 2: 2 sequences aligned (2 vectors per column) | |
| // One has a gap in the middle | |
| NumericSequence num_msa2 = { | |
| {{1.0, 2.0}, {1.15, 2.05}}, // Column 0: 2 similar vectors | |
| {{}, {3.2, 4.2}}, // Column 1: gap and a vector | |
| {{7.0, 8.0}, {7.1, 8.1}}, // Column 2: different vectors | |
| {{5.0, 6.0}, {5.05, 6.05}} // Column 3: similar to msa1[2] | |
| }; | |
| printf("Numeric MSA 1 (3 sequences, length %zu):\n", num_msa1.size()); | |
| for (size_t i = 0; i < num_msa1.size(); i++) { | |
| printf(" Column %zu: [", i); | |
| for (size_t j = 0; j < num_msa1[i].size(); j++) { | |
| if (num_msa1[i][j].empty()) { | |
| printf("GAP"); | |
| } else { | |
| printf("["); | |
| for (size_t k = 0; k < num_msa1[i][j].size(); k++) { | |
| printf("%.1f", num_msa1[i][j][k]); | |
| if (k < num_msa1[i][j].size() - 1) printf(", "); | |
| } | |
| printf("]"); | |
| } | |
| if (j < num_msa1[i].size() - 1) printf(", "); | |
| } | |
| printf("]\n"); | |
| } | |
| printf("\nNumeric MSA 2 (2 sequences, length %zu):\n", num_msa2.size()); | |
| for (size_t i = 0; i < num_msa2.size(); i++) { | |
| printf(" Column %zu: [", i); | |
| for (size_t j = 0; j < num_msa2[i].size(); j++) { | |
| if (num_msa2[i][j].empty()) { | |
| printf("GAP"); | |
| } else { | |
| printf("["); | |
| for (size_t k = 0; k < num_msa2[i][j].size(); k++) { | |
| printf("%.1f", num_msa2[i][j][k]); | |
| if (k < num_msa2[i][j].size() - 1) printf(", "); | |
| } | |
| printf("]"); | |
| } | |
| if (j < num_msa2[i].size() - 1) printf(", "); | |
| } | |
| printf("]\n"); | |
| } | |
| // Align the two numeric MSAs | |
| auto num_msa_util = std::make_unique<NumericAlignmentUtil>(1.0, 1.0); | |
| Alignment<Vector> num_msa_aligner(std::move(num_msa_util), 0.5, 10); | |
| NumericSequence aligned_num_msa; | |
| double num_msa_score = num_msa_aligner.align(num_msa1, num_msa2, aligned_num_msa); | |
| printf("\nNumeric MSA-to-MSA alignment score: %.6f\n", num_msa_score); | |
| printf("Aligned numeric MSA (combines all 5 sequences, length %zu):\n", aligned_num_msa.size()); | |
| for (size_t i = 0; i < aligned_num_msa.size(); i++) { | |
| printf(" Column %zu: [", i); | |
| for (size_t j = 0; j < aligned_num_msa[i].size(); j++) { | |
| if (aligned_num_msa[i][j].empty()) { | |
| printf("GAP"); | |
| } else { | |
| printf("["); | |
| for (size_t k = 0; k < aligned_num_msa[i][j].size(); k++) { | |
| printf("%.2f", aligned_num_msa[i][j][k]); | |
| if (k < aligned_num_msa[i][j].size() - 1) printf(", "); | |
| } | |
| printf("]"); | |
| } | |
| if (j < aligned_num_msa[i].size() - 1) printf(", "); | |
| } | |
| printf("]\n"); | |
| } | |
| printf("\nInterpretation: Each column contains 5 vectors total:\n"); | |
| printf(" - First 3 vectors from MSA1\n"); | |
| printf(" - Last 2 vectors from MSA2\n"); | |
| printf(" - GAP entries where sequences don't align\n"); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment