Skip to content

Instantly share code, notes, and snippets.

@Hoohm
Created August 10, 2018 15:32
Show Gist options
  • Select an option

  • Save Hoohm/ddf7754b1eed8ccf2c556b7969799c3a to your computer and use it in GitHub Desktop.

Select an option

Save Hoohm/ddf7754b1eed8ccf2c556b7969799c3a to your computer and use it in GitHub Desktop.
Python script that generates a fuzzy regex for capturing barcodes based on a given string pattern
import string
import regex
import numpy as np
# Example read layout used by the demo at the bottom of the file: literal
# A/C/G/T bases are constant sequence, 'N' runs are cell-barcode placeholders
# and the 'X' run is the UMI placeholder.
test = (
    'AGCTAGCTGA'
    'NNNNN'
    'AGCTA'
    'NNNNN'
    'AGCTAGCTAG'
    'XXXXXXXXXX'
)
def generate_regex(barcode_template, positions, number_mismatches):
    """Build a fuzzy regex pattern string from a barcode template.

    Literal ``A``/``T``/``G``/``C`` characters of the template are copied
    verbatim. Each placeholder run listed in ``positions`` becomes a named
    capture group (``cell_barcode0``, ``cell_barcode1``, ... and
    ``umi_barcode0``, ...) matching that many ``[ATGC]`` characters. The whole
    pattern is wrapped in a non-capturing group followed by the ``{s<=N}``
    fuzzy constraint understood by the third-party ``regex`` module.

    Args:
        barcode_template: Template string mixing literal bases and
            placeholder characters.
        positions: Mapping with the keys ``'cell_barcode'`` and
            ``'umi_barcode'``; each value is a list of ``[start, end]``
            pairs (inclusive indexes into ``barcode_template``).
        number_mismatches: Maximum number of substitutions written into the
            ``{s<=N}`` constraint.

    Returns:
        The pattern as a string (not compiled).

    Raises:
        ValueError: If a non-ATGC character of the template is not covered
            by any range in ``positions``. Without this check the scan would
            never advance and loop forever.
    """
    pieces = ['(?:']
    position = 0
    # Running index per barcode kind, used to number the capture groups.
    group_ids = {'cell_barcode': 0, 'umi_barcode': 0}
    template_length = len(barcode_template)
    while position < template_length:
        base = barcode_template[position]
        if base in 'ATGC':
            pieces.append(base)
            position += 1
            continue
        run_start = position
        # Cell barcodes are checked before UMIs, so a UMI run that directly
        # follows a cell-barcode run is picked up in the same pass.
        for kind in ('cell_barcode', 'umi_barcode'):
            for pair in positions[kind]:
                if position in pair:
                    run_length = pair[1] - pair[0] + 1
                    pieces.append(
                        '(?P<{}{}>[ATGC]{{{}}})'.format(
                            kind, group_ids[kind], run_length
                        )
                    )
                    position += run_length
                    group_ids[kind] += 1
        if position == run_start:
            # Nothing consumed this character: fail loudly instead of hanging.
            raise ValueError(
                'Unrecognized character {!r} at position {} of the barcode '
                'template'.format(base, run_start)
            )
    pieces.append('){{s<={}}}'.format(number_mismatches))
    return ''.join(pieces)
def consecutive(data, stepsize=1):
    """Collapse a sequence of indexes into ``[first, last]`` pairs per run.

    A run is a maximal stretch of ``data`` in which each element differs from
    the previous one by exactly ``stepsize``.

    Args:
        data: Sequence (list or array) of numbers, e.g. character positions.
        stepsize: Difference between neighbours that counts as "consecutive".

    Returns:
        A list of ``[first, last]`` pairs, one per run, in input order.
        An empty ``data`` yields an empty list.
    """
    # np.split on empty input returns one empty array, and indexing it with
    # [0] would raise IndexError; there are simply no runs to report.
    if len(data) == 0:
        return []
    # Indexes at which a new run starts (the element after each gap).
    run_starts = np.where(np.diff(data) != stepsize)[0] + 1
    return [[run[0], run[-1]] for run in np.split(data, run_starts)]
def get_barcodes_positions(barcode_template, barcode_id, umi_id):
    """Locate the cell-barcode and UMI placeholder runs in a template.

    Args:
        barcode_template: Template string to scan.
        barcode_id: Character that marks cell-barcode positions.
        umi_id: Character that marks UMI positions.

    Returns:
        A dict with the keys ``'cell_barcode'`` and ``'umi_barcode'``; each
        value is the list of ``[start, end]`` pairs produced by
        ``consecutive`` for the matching positions, or an empty list when the
        marker character does not occur in the template.
    """
    # Scan the template that was passed in, not the module-level example.
    cell_positions = [
        pos for pos, char in enumerate(barcode_template) if char == barcode_id
    ]
    umi_positions = [
        pos for pos, char in enumerate(barcode_template) if char == umi_id
    ]
    # Skip the run-grouping helper entirely when a marker is absent, so a
    # template without cell barcodes or without a UMI is still accepted.
    return {
        'cell_barcode': consecutive(cell_positions) if cell_positions else [],
        'umi_barcode': consecutive(umi_positions) if umi_positions else [],
    }
# Demo: derive the placeholder positions from the example template, build a
# pattern that tolerates up to 3 substitutions, and run it on sample reads.
positions = get_barcodes_positions(test, barcode_id='N', umi_id='X')
regex_string = generate_regex(test, positions, 3)
regex_compiled = regex.compile(regex_string)

# Sample reads whose leading constant region differs from the template by
# 0, 1, 2, 3 and 4 bases respectively.
sample_reads = (
    'AGCTAGCTGAATCGAAGCTAAGCTAAGCTAGCTAGGGCAGTCGAT',
    'ACCTAGCTGAATCGAAGCTAAGCTAAGCTAGCTAGGGCAGTCGAT',
    'ACATAGCTGAATCGAAGCTAAGCTAAGCTAGCTAGGGCAGTCGAT',
    'ACAAAGCTGAATCGAAGCTAAGCTAAGCTAGCTAGGGCAGTCGAT',
    'CCAAAGCTGAATCGAAGCTAAGCTAAGCTAGCTAGGGCAGTCGAT',
)

# Echo the unmodified read first, then the matches found in each read.
print(sample_reads[0])
for sample_read in sample_reads:
    print(regex.findall(regex_compiled, sample_read))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment