Last active
November 5, 2021 12:11
-
-
Save avilaHugo/9c2e44eef72aedeba48e7786964c9665 to your computer and use it in GitHub Desktop.
Script to count k-mer frequencies.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import os | |
| from sys import argv | |
| from collections import OrderedDict | |
| from itertools import product | |
| from Bio import SeqIO | |
| def get_word_dict(word_size: int = 1, recursive = True): | |
| """Get a orderedDict of k-mers with zero values: {"A": 0, "C": 0 ...} | |
| Word size: k-mer length (3-mer = "AAA", "AAC" ...). | |
| recursive: If True will create k-1 k-mers as well: word size = 3 (3-mer, 2-mer, 1-mer). | |
| """ | |
| count_dict = [] | |
| if not recursive: | |
| for y in product("ACGT", repeat=word_size): | |
| count_dict.append(''.join(y)) | |
| else: | |
| for i in range(1, word_size + 1): | |
| for y in product("ACGT", repeat=i): | |
| count_dict.append(''.join(y)) | |
| return OrderedDict.fromkeys(count_dict, 0) | |
| def freq_from_dict(nucleotide_dict, record): | |
| """Get frequence of k-mers in a biopython SeqIO object. | |
| For k > 1, the count is overlapped: 2-mer and record="AAATGC" -> AA=2, AT=1, TG=1, GC=1. | |
| frequence = pattern/sum of all same length patterns. Ex: AA/(AA+AC+AG+AT+CA+CC+CG+CT+GA+GC+GG+GT+TA+TC+TG+TT) | |
| """ | |
| for nucleotide in nucleotide_dict: | |
| if len(nucleotide_dict) == 1: | |
| nucleotide_dict[nucleotide] += record.seq.count(nucleotide) | |
| else: | |
| nucleotide_dict[nucleotide] += record.seq.count_overlap(nucleotide) | |
| sum_dict = {} | |
| for nucleotide in nucleotide_dict: | |
| length_kmer = len(nucleotide) | |
| if not length_kmer in sum_dict: | |
| sum_dict[length_kmer] = sum([y for i,y in nucleotide_dict.items() if len(i) == length_kmer]) | |
| for nucleotide in nucleotide_dict: | |
| nucleotide_dict[nucleotide] = nucleotide_dict[nucleotide]/sum_dict[len(nucleotide)] | |
| return nucleotide_dict | |
| def main(input_path: str, output_path: str, word_size: int = 1, recursive = True) -> None: | |
| """ Reads a fasta file and outputs a csv with fasta ids and k-mer frequencies. | |
| input_path and output_path: strings with paths. | |
| Word size: k-mer length (3-mer = "AAA", "AAC" ...). | |
| recursive: If True will create k-1 k-mers as well: word size = 3 (3-mer, 2-mer, 1-mer). | |
| """ | |
| result = [] | |
| with open(input_path, 'r') as input_handle: | |
| for record in SeqIO.parse(input_handle, 'fasta'): | |
| freq_dict = freq_from_dict(get_word_dict(word_size=word_size, recursive=recursive), record=record) | |
| result.append('{},{}\n'.format(record.id, ','.join(map(str ,freq_dict.values())))) | |
| with open(output_path, 'w') as output_handle: | |
| output_handle.write('records,{}\n'.format(','.join(freq_dict.keys()))) | |
| for i in result: | |
| output_handle.write(i) | |
| if __name__ == "__main__": | |
| main( | |
| input_path=argv[1], | |
| output_path=argv[2], | |
| word_size=(int(argv[3])), | |
| recursive=(True if argv[4] else False) | |
| ) |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Original Biostars post: https://www.biostars.org/p/9462180/#9496424.