Skip to content

Instantly share code, notes, and snippets.

@avilaHugo
Last active November 5, 2021 12:11
Show Gist options
  • Select an option

  • Save avilaHugo/9c2e44eef72aedeba48e7786964c9665 to your computer and use it in GitHub Desktop.

Select an option

Save avilaHugo/9c2e44eef72aedeba48e7786964c9665 to your computer and use it in GitHub Desktop.
Script to count k-mer frequencies.
#!/usr/bin/env python3
import os
from sys import argv
from collections import OrderedDict
from itertools import product
from Bio import SeqIO
def get_word_dict(word_size: int = 1, recursive = True):
"""Get a orderedDict of k-mers with zero values: {"A": 0, "C": 0 ...}
Word size: k-mer length (3-mer = "AAA", "AAC" ...).
recursive: If True will create k-1 k-mers as well: word size = 3 (3-mer, 2-mer, 1-mer).
"""
count_dict = []
if not recursive:
for y in product("ACGT", repeat=word_size):
count_dict.append(''.join(y))
else:
for i in range(1, word_size + 1):
for y in product("ACGT", repeat=i):
count_dict.append(''.join(y))
return OrderedDict.fromkeys(count_dict, 0)
def freq_from_dict(nucleotide_dict, record):
"""Get frequence of k-mers in a biopython SeqIO object.
For k > 1, the count is overlapped: 2-mer and record="AAATGC" -> AA=2, AT=1, TG=1, GC=1.
frequence = pattern/sum of all same length patterns. Ex: AA/(AA+AC+AG+AT+CA+CC+CG+CT+GA+GC+GG+GT+TA+TC+TG+TT)
"""
for nucleotide in nucleotide_dict:
if len(nucleotide_dict) == 1:
nucleotide_dict[nucleotide] += record.seq.count(nucleotide)
else:
nucleotide_dict[nucleotide] += record.seq.count_overlap(nucleotide)
sum_dict = {}
for nucleotide in nucleotide_dict:
length_kmer = len(nucleotide)
if not length_kmer in sum_dict:
sum_dict[length_kmer] = sum([y for i,y in nucleotide_dict.items() if len(i) == length_kmer])
for nucleotide in nucleotide_dict:
nucleotide_dict[nucleotide] = nucleotide_dict[nucleotide]/sum_dict[len(nucleotide)]
return nucleotide_dict
def main(input_path: str, output_path: str, word_size: int = 1, recursive = True) -> None:
""" Reads a fasta file and outputs a csv with fasta ids and k-mer frequencies.
input_path and output_path: strings with paths.
Word size: k-mer length (3-mer = "AAA", "AAC" ...).
recursive: If True will create k-1 k-mers as well: word size = 3 (3-mer, 2-mer, 1-mer).
"""
result = []
with open(input_path, 'r') as input_handle:
for record in SeqIO.parse(input_handle, 'fasta'):
freq_dict = freq_from_dict(get_word_dict(word_size=word_size, recursive=recursive), record=record)
result.append('{},{}\n'.format(record.id, ','.join(map(str ,freq_dict.values()))))
with open(output_path, 'w') as output_handle:
output_handle.write('records,{}\n'.format(','.join(freq_dict.keys())))
for i in result:
output_handle.write(i)
if __name__ == "__main__":
main(
input_path=argv[1],
output_path=argv[2],
word_size=(int(argv[3])),
recursive=(True if argv[4] else False)
)
@avilaHugo
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment