avilaHugo/get_nucl_freq_from_fasta.py

## get_nucl_freq_from_fasta.py
#!/usr/bin/env python3

import os
from sys import argv
from collections import OrderedDict
from itertools import product
from Bio import SeqIO

def get_word_dict(word_size: int = 1, recursive = True):
	"""Get a orderedDict of k-mers with zero values: {"A": 0, "C": 0 ...}
	Word size: k-mer length (3-mer = "AAA", "AAC" ...).
	recursive: If True will create k-1 k-mers as well: word size = 3 (3-mer, 2-mer, 1-mer).
	"""
    count_dict = []
    if not recursive:
        for y in product("ACGT", repeat=word_size):
            count_dict.append(''.join(y))
    else:
        for i in range(1, word_size + 1):
            for y in product("ACGT", repeat=i):
                count_dict.append(''.join(y))
    return OrderedDict.fromkeys(count_dict, 0)

def freq_from_dict(nucleotide_dict, record):
	"""Get frequence of k-mers in a biopython SeqIO object.
	For k > 1, the count is overlapped: 2-mer and record="AAATGC" -> AA=2, AT=1, TG=1, GC=1.
	frequence = pattern/sum of all same length patterns. Ex: AA/(AA+AC+AG+AT+CA+CC+CG+CT+GA+GC+GG+GT+TA+TC+TG+TT)
	"""
    for nucleotide in nucleotide_dict:
        if len(nucleotide_dict) == 1:
            nucleotide_dict[nucleotide] += record.seq.count(nucleotide)
        else:
            nucleotide_dict[nucleotide] += record.seq.count_overlap(nucleotide)
    sum_dict = {}
    for nucleotide in nucleotide_dict:
        length_kmer = len(nucleotide)
        if not length_kmer in sum_dict:
            sum_dict[length_kmer] = sum([y for i,y in nucleotide_dict.items() if len(i) == length_kmer])
    for nucleotide in nucleotide_dict:
        nucleotide_dict[nucleotide] = nucleotide_dict[nucleotide]/sum_dict[len(nucleotide)]
    return nucleotide_dict

def main(input_path: str, output_path: str, word_size: int = 1, recursive = True) -> None:
	""" Reads a fasta file and outputs a csv with fasta ids and k-mer frequencies.
	input_path and output_path: strings with paths.
	Word size: k-mer length (3-mer = "AAA", "AAC" ...).
	recursive: If True will create k-1 k-mers as well: word size = 3 (3-mer, 2-mer, 1-mer).
	"""
    result = []
    with open(input_path, 'r') as input_handle:
        for record in SeqIO.parse(input_handle, 'fasta'):
            freq_dict = freq_from_dict(get_word_dict(word_size=word_size, recursive=recursive), record=record)
            result.append('{},{}\n'.format(record.id, ','.join(map(str ,freq_dict.values()))))
    with open(output_path, 'w') as output_handle:
        output_handle.write('records,{}\n'.format(','.join(freq_dict.keys())))
        for i in result:
            output_handle.write(i)


if __name__ == "__main__":
    main(
        input_path=argv[1],
        output_path=argv[2],
        word_size=(int(argv[3])),
        recursive=(True if argv[4] else False)
    )
	#!/usr/bin/env python3

	import os
	from sys import argv
	from collections import OrderedDict
	from itertools import product
	from Bio import SeqIO

	def get_word_dict(word_size: int = 1, recursive = True):
	"""Get a orderedDict of k-mers with zero values: {"A": 0, "C": 0 ...}
	Word size: k-mer length (3-mer = "AAA", "AAC" ...).
	recursive: If True will create k-1 k-mers as well: word size = 3 (3-mer, 2-mer, 1-mer).
	"""
	count_dict = []
	if not recursive:
	for y in product("ACGT", repeat=word_size):
	count_dict.append(''.join(y))
	else:
	for i in range(1, word_size + 1):
	for y in product("ACGT", repeat=i):
	count_dict.append(''.join(y))
	return OrderedDict.fromkeys(count_dict, 0)

	def freq_from_dict(nucleotide_dict, record):
	"""Get frequence of k-mers in a biopython SeqIO object.
	For k > 1, the count is overlapped: 2-mer and record="AAATGC" -> AA=2, AT=1, TG=1, GC=1.
	frequence = pattern/sum of all same length patterns. Ex: AA/(AA+AC+AG+AT+CA+CC+CG+CT+GA+GC+GG+GT+TA+TC+TG+TT)
	"""
	for nucleotide in nucleotide_dict:
	if len(nucleotide_dict) == 1:
	nucleotide_dict[nucleotide] += record.seq.count(nucleotide)
	else:
	nucleotide_dict[nucleotide] += record.seq.count_overlap(nucleotide)
	sum_dict = {}
	for nucleotide in nucleotide_dict:
	length_kmer = len(nucleotide)
	if not length_kmer in sum_dict:
	sum_dict[length_kmer] = sum([y for i,y in nucleotide_dict.items() if len(i) == length_kmer])
	for nucleotide in nucleotide_dict:
	nucleotide_dict[nucleotide] = nucleotide_dict[nucleotide]/sum_dict[len(nucleotide)]
	return nucleotide_dict

	def main(input_path: str, output_path: str, word_size: int = 1, recursive = True) -> None:
	""" Reads a fasta file and outputs a csv with fasta ids and k-mer frequencies.
	input_path and output_path: strings with paths.
	Word size: k-mer length (3-mer = "AAA", "AAC" ...).
	recursive: If True will create k-1 k-mers as well: word size = 3 (3-mer, 2-mer, 1-mer).
	"""
	result = []
	with open(input_path, 'r') as input_handle:
	for record in SeqIO.parse(input_handle, 'fasta'):
	freq_dict = freq_from_dict(get_word_dict(word_size=word_size, recursive=recursive), record=record)
	result.append('{},{}\n'.format(record.id, ','.join(map(str ,freq_dict.values()))))
	with open(output_path, 'w') as output_handle:
	output_handle.write('records,{}\n'.format(','.join(freq_dict.keys())))
	for i in result:
	output_handle.write(i)


	if __name__ == "__main__":
	main(
	input_path=argv[1],
	output_path=argv[2],
	word_size=(int(argv[3])),
	recursive=(True if argv[4] else False)
	)
No results found